diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 4acbc201ec58e..260f80a5f532e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7744,6 +7744,29 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
       Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
       return;
     }
+
+    // If this is a v2s copy from a vgpr16 source to an sgpr32 destination,
+    // replace the copy with a SUBREG_TO_REG.
+    if (ST.useRealTrue16Insts() && Inst.isCopy() &&
+        Inst.getOperand(1).getReg().isVirtual() &&
+        RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
+      const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
+      if (16 == RI.getRegSizeInBits(*SrcRegRC) &&
+          32 == RI.getRegSizeInBits(*NewDstRC)) {
+        Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
+        BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
+                get(TargetOpcode::SUBREG_TO_REG), NewDstReg)
+            .add(MachineOperand::CreateImm(0))
+            .add(Inst.getOperand(1))
+            .add(MachineOperand::CreateImm(AMDGPU::lo16));
+        Inst.eraseFromParent();
+
+        MRI.replaceRegWith(DstReg, NewDstReg);
+        addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
+        return;
+      }
+    }
+
     Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
     MRI.replaceRegWith(DstReg, NewDstReg);
     legalizeOperands(Inst, MDT);
@@ -7837,6 +7860,22 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
     assert(NewDstRC);
     NewDstReg = MRI.createVirtualRegister(NewDstRC);
     MRI.replaceRegWith(DstReg, NewDstReg);
+
+    // Check the users of NewInstr. If a user is a true16 instruction and the
+    // operand size is 16 bits, access the lo16 subregister instead.
+    if (ST.useRealTrue16Insts() && NewDstRC == &AMDGPU::VGPR_32RegClass) {
+      for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
+                                             E = MRI.use_end();
+           I != E; ++I) {
+        MachineInstr &UseMI = *I->getParent();
+        unsigned UseMIOpcode = UseMI.getOpcode();
+        if (AMDGPU::isTrue16Inst(UseMIOpcode) &&
+            (16 ==
+             RI.getRegSizeInBits(*getOpRegClass(UseMI, I.getOperandNo())))) {
+          I->setSubReg(AMDGPU::lo16);
+        }
+      }
+    }
   }
   fixImplicitOperands(*NewInstr);
   // Legalize the operands
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
index 466f28805dfcf..419f57972a485 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
@@ -1,41 +1,35 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN %s
-# XFAIL: *
-# FIXME-TRUE16 reenable after fix-sgpr-copies is updated for true16 flow
 
 ---
-name: cmp_f16
+name: cvt_hi_f32_f16
 body: |
-  bb.0.entry:
-    ; GCN-LABEL: name: cmp_f16
+  bb.0:
+    ; GCN-LABEL: name: cvt_hi_f32_f16
     ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
-    ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
     ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_U16_t16_e64_]]
-    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY killed [[COPY]]
-    ; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, [[COPY1]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_t16_e64_]], implicit $exec
+    ; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[SUBREG_TO_REG]]
+    ; GCN-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_t16_e64 0, [[COPY]].hi16, 0, 0, 0, implicit $mode, implicit $exec
     %0:vgpr_16 = IMPLICIT_DEF
-    %1:sreg_32 = IMPLICIT_DEF
-    %2:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
-    %3:sreg_32 = COPY %2:vgpr_16
-    nofpexcept S_CMP_LT_F16 killed %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $mode
-    %4:sreg_32_xm0_xexec = COPY $scc
-    %5:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %4, implicit $exec
+    %1:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
+    %2:sreg_32 = COPY %1:vgpr_16
+    %3:sreg_32 = S_CVT_HI_F32_F16 %2:sreg_32, implicit $mode
 ...
 ---
-name: cvt_hi_f32_f16
+name: s_or_b32
 body: |
   bb.0:
-    ; GCN-LABEL: name: cvt_hi_f32_f16
+    ; GCN-LABEL: name: s_or_b32
     ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
     ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_U16_t16_e64_]]
-    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
-    ; GCN-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_t16_e64 0, [[COPY1]].hi16, 0, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16
+    ; GCN-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[SUBREG_TO_REG]], [[SUBREG_TO_REG]], implicit $exec
+    ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_1:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[V_OR_B32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec
     %0:vgpr_16 = IMPLICIT_DEF
     %1:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
     %2:sreg_32 = COPY %1:vgpr_16
-    %3:sreg_32 = S_CVT_HI_F32_F16 %2:sreg_32, implicit $mode
+    %3:sreg_32 = S_OR_B32 %2:sreg_32, %2:sreg_32, implicit-def $scc
+    %4:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %3:sreg_32, 0, 0, 0, implicit $mode, implicit $exec
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index 644c88457714b..8c5bc4a33a303 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -255,15 +255,15 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT:    global_load_b32 v2, v1, s[2:3]
+; GFX12-TRUE16-NEXT:    global_load_b32 v0, v1, s[2:3]
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0.15915494, v2.l
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0.15915494, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0.15915494, v2.l
-; GFX12-TRUE16-NEXT:    ; kill: def $vgpr2 killed $vgpr2_lo16 killed $exec
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_cos_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_cos_f16_e32 v0.h, v0.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-TRUE16-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index 622a335015eba..297e4f0927204 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -1093,13 +1093,13 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
 ; GFX11-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cmp_o_f16_e64 s0, v1.l, v1.h
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0x7e00, v2.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0x7e00, v1.l, s0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-TRUE16-NEXT:    ;;#ASMSTART
 ; GFX11-TRUE16-NEXT:    ; use v0
 ; GFX11-TRUE16-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
index 999282bf60539..ffbb9fde26e55 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
@@ -906,13 +906,13 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
 ; GFX11-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cmp_o_f16_e64 s0, v1.l, v1.h
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0x7e00, v2.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0x7e00, v1.l, s0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-TRUE16-NEXT:    ;;#ASMSTART
 ; GFX11-TRUE16-NEXT:    ; use v0
 ; GFX11-TRUE16-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 27ec1cfadd9d2..de12f2b246f57 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -259,13 +259,13 @@ define amdgpu_kernel void @rint_v2f16(
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_rndne_f16_e32 v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-TRUE16-NEXT:    v_rndne_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_rndne_f16_e32 v0.h, v1.l
-; GFX12-TRUE16-NEXT:    ; kill: def $vgpr1 killed $vgpr1_lo16 killed $exec
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], null
 ; GFX12-TRUE16-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index e16540fec0229..1a426096da197 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -255,15 +255,15 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT:    global_load_b32 v2, v1, s[2:3]
+; GFX12-TRUE16-NEXT:    global_load_b32 v0, v1, s[2:3]
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0.15915494, v2.l
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0.15915494, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0.15915494, v2.l
-; GFX12-TRUE16-NEXT:    ; kill: def $vgpr2 killed $vgpr2_lo16 killed $exec
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_sin_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_sin_f16_e32 v0.h, v0.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-TRUE16-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
index ae41f4381251d..0f709b044f63a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
@@ -238,13 +238,13 @@ define amdgpu_kernel void @trunc_v2f16(
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_trunc_f16_e32 v0.h, v1.l
-; GFX12-TRUE16-NEXT:    ; kill: def $vgpr1 killed $vgpr1_lo16 killed $exec
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], null
 ; GFX12-TRUE16-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index f6e9f152dca5e..51dfbda53ad4c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -736,43 +736,37 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
 ; GFX12-TRUE16-LABEL: constant_load_v16i16_align2:
 ; GFX12-TRUE16:       ; %bb.0: ; %entry
 ; GFX12-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v9, 0
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_clause 0x7
-; GFX12-TRUE16-NEXT:    global_load_d16_b16 v0, v9, s[0:1] offset:16
-; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v9, s[0:1] offset:12
-; GFX12-TRUE16-NEXT:    global_load_d16_b16 v1, v9, s[0:1] offset:8
-; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v1, v9, s[0:1] offset:4
-; GFX12-TRUE16-NEXT:    global_load_d16_b16 v4, v9, s[0:1] offset:28
-; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v4, v9, s[0:1] offset:24
-; GFX12-TRUE16-NEXT:    global_load_d16_b16 v5, v9, s[0:1] offset:20
-; GFX12-TRUE16-NEXT:    global_load_d16_b16 v8, v9, s[0:1]
-; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x6
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.h
-; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x5
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v1.l
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v3, v8, s[0:1] offset:28
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v2, v8, s[0:1] offset:24
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v1, v8, s[0:1] offset:20
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v0, v8, s[0:1] offset:16
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v7, v8, s[0:1] offset:12
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v6, v8, s[0:1] offset:8
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v5, v8, s[0:1] offset:4
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v4, v8, s[0:1]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x7
+; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v3, v8, s[0:1] offset:30
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x7
+; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v2, v8, s[0:1] offset:26
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x7
+; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v1, v8, s[0:1] offset:22
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x7
+; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v8, s[0:1] offset:18
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x7
+; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v7, v8, s[0:1] offset:14
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x7
+; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v6, v8, s[0:1] offset:10
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x7
+; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v5, v8, s[0:1] offset:6
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x7
+; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v4, v8, s[0:1] offset:2
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x4
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
-; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x3
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.l
-; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x2
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v4.h
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v0.l
-; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v8.l
-; GFX12-TRUE16-NEXT:    s_clause 0x7
-; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v7, v9, s[0:1] offset:30
-; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v6, v9, s[0:1] offset:26
-; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v5, v9, s[0:1] offset:22
-; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v4, v9, s[0:1] offset:18
-; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v3, v9, s[0:1] offset:14
-; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v2, v9, s[0:1] offset:10
-; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v1, v9, s[0:1] offset:6
-; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v9, s[0:1] offset:2
+; GFX12-TRUE16-NEXT:    global_store_b128 v[0:1], v[0:3], off
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    s_clause 0x1
 ; GFX12-TRUE16-NEXT:    global_store_b128 v[0:1], v[4:7], off
-; GFX12-TRUE16-NEXT:    global_store_b128 v[0:1], v[0:3], off
 ; GFX12-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX12-FAKE16-LABEL: constant_load_v16i16_align2:
diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index 9dd7b946ff5bd..7339b545686f5 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -880,17 +880,17 @@ define amdgpu_kernel void @select_v2f16(
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX11-TRUE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v1.l, v0.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
 ; GFX11-TRUE16-NEXT:    v_cmp_lt_f16_e64 s0, v5.l, v4.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v2.l, v3.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v6.l, s0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s8
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -1066,17 +1066,17 @@ define amdgpu_kernel void @select_v2f16_imm_a(
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX11-TRUE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, 0.5, v0.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX11-TRUE16-NEXT:    v_cmp_lt_f16_e64 s0, 0x3900, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v4.l, s0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -1245,17 +1245,17 @@ define amdgpu_kernel void @select_v2f16_imm_b(
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0.5, v0.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_f16_e64 s0, 0x3900, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, v4.l, s0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -1428,15 +1428,15 @@ define amdgpu_kernel void @select_v2f16_imm_c(
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX11-TRUE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v1.l, v0.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cmp_nlt_f16_e64 s0, v4.l, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3800, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0x3900, v1.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0x3800, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3900, v0.l, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s4
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -1609,15 +1609,15 @@ define amdgpu_kernel void @select_v2f16_imm_d(
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX11-TRUE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v1.l, v0.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cmp_lt_f16_e64 s0, v4.l, v3.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3800, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0x3900, v1.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0x3800, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3900, v0.l, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s4
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;