diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 260f80a5f532e..61fda0eef6314 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -7228,6 +7228,29 @@ bool SIInstrWorklist::isDeferred(MachineInstr *MI) { return DeferredList.contains(MI); } +// A 16bit SALU uses sgpr32. If a 16bit SALU gets lowered to VALU in true16 mode, +// sgpr32 is replaced with vgpr32, which is illegal in a t16 inst. Need to add +// subreg access properly. This can be removed after we have sgpr16 in place +void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &Inst, + MachineRegisterInfo &MRI) const { + unsigned Opcode = Inst.getOpcode(); + if (!AMDGPU::isTrue16Inst(Opcode) || !ST.useRealTrue16Insts()) + return; + + for (MachineOperand &Op : Inst.explicit_operands()) { + unsigned OpIdx = Op.getOperandNo(); + if (!OpIdx) + continue; + if (Op.isReg() && RI.isVGPR(MRI, Op.getReg())) { + unsigned RCID = get(Opcode).operands()[OpIdx].RegClass; + const TargetRegisterClass *RC = RI.getRegClass(RCID); + if (RI.getRegSizeInBits(*RC) == 16) { + Op.setSubReg(AMDGPU::lo16); + } + } + } +} + void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const { @@ -7613,6 +7636,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, .add(Inst.getOperand(0)) .add(Inst.getOperand(1)); } + legalizeOperandsVALUt16(*NewInstr, MRI); legalizeOperands(*NewInstr, MDT); int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); MachineOperand SCCOp = Inst.getOperand(SCCIdx); @@ -7682,6 +7706,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, .addImm(0) // omod .addImm(0); // opsel0 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst); + legalizeOperandsVALUt16(*NewInstr, MRI); legalizeOperands(*NewInstr, MDT); addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist); Inst.eraseFromParent(); @@ -7747,6 +7772,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, // 
If this is a v2s copy src from vgpr16 to sgpr32, // replace vgpr copy to subreg_to_reg + // This can be removed after we have sgpr16 in place if (ST.useRealTrue16Insts() && Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() && RI.isVGPR(MRI, Inst.getOperand(1).getReg())) { @@ -7785,11 +7811,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, NewInstr.addImm(0); if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) { MachineOperand Src = Inst.getOperand(1); - if (AMDGPU::isTrue16Inst(NewOpcode) && ST.useRealTrue16Insts() && - Src.isReg() && RI.isVGPR(MRI, Src.getReg())) - NewInstr.addReg(Src.getReg(), 0, AMDGPU::lo16); - else - NewInstr->addOperand(Src); + NewInstr->addOperand(Src); } if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { @@ -7863,6 +7885,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, // Check useMI of NewInstr. If used by a true16 instruction, // add a lo16 subreg access if size mismatched + // This can be removed after we have sgpr16 in place if (ST.useRealTrue16Insts() && NewDstRC == &AMDGPU::VGPR_32RegClass) { for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg), E = MRI.use_end(); I != E; ++I) { @@ -7878,6 +7901,9 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } } fixImplicitOperands(*NewInstr); + + legalizeOperandsVALUt16(*NewInstr, MRI); + // Legalize the operands legalizeOperands(*NewInstr, MDT); if (NewDstReg) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 79ef1432d512a..d63225c067c9d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1279,6 +1279,10 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { /// was moved to VGPR. \returns true if succeeded. bool moveFlatAddrToVGPR(MachineInstr &Inst) const; + /// Fix operands in Inst when lowering a 16bit SALU instruction to VALU. 
+ void legalizeOperandsVALUt16(MachineInstr &Inst, + MachineRegisterInfo &MRI) const; + /// Replace the instructions opcode with the equivalent VALU /// opcode. This function will also move the users of MachineInstruntions /// in the \p WorkList to the VALU if necessary. If present, \p MDT is diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir index 419f57972a485..137a9aaea6a77 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir @@ -1,6 +1,26 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN %s +--- +name: cmp_f16 +body: | + bb.0.entry: + ; GCN-LABEL: name: cmp_f16 + ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16 + ; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, killed [[SUBREG_TO_REG]].lo16, 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_t16_e64_]], implicit $exec + %0:vgpr_16 = IMPLICIT_DEF + %1:sreg_32 = IMPLICIT_DEF + %2:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec + %3:sreg_32 = COPY %2:vgpr_16 + nofpexcept S_CMP_LT_F16 killed %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $mode + %4:sreg_32_xm0_xexec = COPY $scc + %5:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %4, implicit $exec +... 
+ --- name: cvt_hi_f32_f16 body: | diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir index 23e4b80b61f69..8bc8eefad6bf7 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir @@ -1,19 +1,26 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 -# FIXME-TRUE16. reenable after fix-sgpr-copies is fixed for true16 flow -# XUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,FAKE16 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=REAL16 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=FAKE16 %s --- name: fmac_f16 body: | bb.0: - ; GCN-LABEL: name: fmac_f16 - ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; GCN-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_FMAC_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMAC_F16_fake16_e64 0, killed [[DEF1]], 0, [[DEF2]], 0, [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec + ; REAL16-LABEL: name: fmac_f16 + ; REAL16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; REAL16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; REAL16-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; REAL16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 
[[DEF]], 0, 0, implicit $mode, implicit $exec + ; REAL16-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; REAL16-NEXT: [[V_FMAC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FMAC_F16_t16_e64 0, killed [[DEF1]], 0, [[DEF2]], 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec + ; + ; FAKE16-LABEL: name: fmac_f16 + ; FAKE16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; FAKE16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; FAKE16-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; FAKE16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec + ; FAKE16-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; FAKE16-NEXT: [[V_FMAC_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMAC_F16_fake16_e64 0, killed [[DEF1]], 0, [[DEF2]], 0, [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec %0:vgpr_32 = IMPLICIT_DEF %1:sreg_32 = IMPLICIT_DEF %2:sreg_32 = IMPLICIT_DEF