Skip to content

[AMDGPU][True16][CodeGen] legalize operands when move16bit SALU to VALU #133985

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 31 additions & 5 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7228,6 +7228,29 @@ bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
return DeferredList.contains(MI);
}

// 16bit SALU use sgpr32. If a 16bit SALU get lowered to VALU in true16 mode,
// sgpr32 is replaced to vgpr32 which is illegal in t16 inst. Need to add
// subreg access properly. This can be removed after we have sgpr16 in place
void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &Inst,
MachineRegisterInfo &MRI) const {
unsigned Opcode = Inst.getOpcode();
if (!AMDGPU::isTrue16Inst(Opcode) || !ST.useRealTrue16Insts())
return;

for (MachineOperand &Op : Inst.explicit_operands()) {
unsigned OpIdx = Op.getOperandNo();
if (!OpIdx)
continue;
if (Op.isReg() && RI.isVGPR(MRI, Op.getReg())) {
unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
const TargetRegisterClass *RC = RI.getRegClass(RCID);
if (RI.getRegSizeInBits(*RC) == 16) {
Op.setSubReg(AMDGPU::lo16);
}
}
}
}

void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
MachineDominatorTree *MDT) const {

Expand Down Expand Up @@ -7613,6 +7636,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
.add(Inst.getOperand(0))
.add(Inst.getOperand(1));
}
legalizeOperandsVALUt16(*NewInstr, MRI);
legalizeOperands(*NewInstr, MDT);
int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
MachineOperand SCCOp = Inst.getOperand(SCCIdx);
Expand Down Expand Up @@ -7682,6 +7706,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
.addImm(0) // omod
.addImm(0); // opsel0
MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
legalizeOperandsVALUt16(*NewInstr, MRI);
legalizeOperands(*NewInstr, MDT);
addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
Inst.eraseFromParent();
Expand Down Expand Up @@ -7747,6 +7772,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,

// If this is a v2s copy src from vgpr16 to sgpr32,
// replace vgpr copy to subreg_to_reg
// This can be remove after we have sgpr16 in place
if (ST.useRealTrue16Insts() && Inst.isCopy() &&
Inst.getOperand(1).getReg().isVirtual() &&
RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
Expand Down Expand Up @@ -7785,11 +7811,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
NewInstr.addImm(0);
if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
MachineOperand Src = Inst.getOperand(1);
if (AMDGPU::isTrue16Inst(NewOpcode) && ST.useRealTrue16Insts() &&
Src.isReg() && RI.isVGPR(MRI, Src.getReg()))
NewInstr.addReg(Src.getReg(), 0, AMDGPU::lo16);
else
NewInstr->addOperand(Src);
NewInstr->addOperand(Src);
}

if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
Expand Down Expand Up @@ -7863,6 +7885,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,

// Check useMI of NewInstr. If used by a true16 instruction,
// add a lo16 subreg access if size mismatched
// This can be remove after we have sgpr16 in place
if (ST.useRealTrue16Insts() && NewDstRC == &AMDGPU::VGPR_32RegClass) {
for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
E = MRI.use_end();
Expand All @@ -7878,6 +7901,9 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
}
}
fixImplicitOperands(*NewInstr);

legalizeOperandsVALUt16(*NewInstr, MRI);

// Legalize the operands
legalizeOperands(*NewInstr, MDT);
if (NewDstReg)
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -1279,6 +1279,10 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
/// was moved to VGPR. \returns true if succeeded.
bool moveFlatAddrToVGPR(MachineInstr &Inst) const;

/// Fix operands in Inst to fix 16bit SALU to VALU lowering.
void legalizeOperandsVALUt16(MachineInstr &Inst,
MachineRegisterInfo &MRI) const;

/// Replace the instructions opcode with the equivalent VALU
/// opcode. This function will also move the users of MachineInstruntions
/// in the \p WorkList to the VALU if necessary. If present, \p MDT is
Expand Down
20 changes: 20 additions & 0 deletions llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
Original file line number Diff line number Diff line change
@@ -1,6 +1,26 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN %s

---
name: cmp_f16
body: |
bb.0.entry:
; GCN-LABEL: name: cmp_f16
; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16
; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, killed [[SUBREG_TO_REG]].lo16, 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_t16_e64_]], implicit $exec
%0:vgpr_16 = IMPLICIT_DEF
%1:sreg_32 = IMPLICIT_DEF
%2:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
%3:sreg_32 = COPY %2:vgpr_16
nofpexcept S_CMP_LT_F16 killed %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $mode
%4:sreg_32_xm0_xexec = COPY $scc
%5:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %4, implicit $exec
...

---
name: cvt_hi_f32_f16
body: |
Expand Down
27 changes: 17 additions & 10 deletions llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
Original file line number Diff line number Diff line change
@@ -1,19 +1,26 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
# FIXME-TRUE16. reenable after fix-sgpr-copies is fixed for true16 flow
# XUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,FAKE16 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=REAL16 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=FAKE16 %s

---
name: fmac_f16
body: |
bb.0:
; GCN-LABEL: name: fmac_f16
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_FMAC_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMAC_F16_fake16_e64 0, killed [[DEF1]], 0, [[DEF2]], 0, [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec
; REAL16-LABEL: name: fmac_f16
; REAL16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; REAL16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; REAL16-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; REAL16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
; REAL16-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; REAL16-NEXT: [[V_FMAC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FMAC_F16_t16_e64 0, killed [[DEF1]], 0, [[DEF2]], 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec
;
; FAKE16-LABEL: name: fmac_f16
; FAKE16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; FAKE16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; FAKE16-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; FAKE16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
; FAKE16-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; FAKE16-NEXT: [[V_FMAC_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMAC_F16_fake16_e64 0, killed [[DEF1]], 0, [[DEF2]], 0, [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
%1:sreg_32 = IMPLICIT_DEF
%2:sreg_32 = IMPLICIT_DEF
Expand Down
Loading