Skip to content

Commit 0325d3d

Browse files
committed
R600/SI: Try to use v_madak_f32
This is a code size optimization when the constant only has one use. llvm-svn: 230148
1 parent 657b1cb commit 0325d3d

File tree

3 files changed

+274
-0
lines changed

3 files changed

+274
-0
lines changed

llvm/lib/Target/R600/SIInstrInfo.cpp

+78
Original file line numberDiff line numberDiff line change
@@ -859,6 +859,84 @@ SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
859859
return RC != &AMDGPU::EXECRegRegClass;
860860
}
861861

862+
/// Strip the src0, src1 and src2 source-modifier operands from \p MI.
///
/// The operand positions are looked up by name from the instruction's current
/// opcode, so this must run before the opcode descriptor is changed.
static void removeModOperands(MachineInstr &MI) {
  const unsigned Opcode = MI.getOpcode();

  // Resolve every index up front, against the operand layout as it is now.
  // The table is ordered src2, src1, src0, which is the order the operands
  // are removed in -- presumably so that dropping one operand never shifts
  // the position of one that is still waiting to be removed.
  const int ModifierIdx[] = {
    AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2_modifiers),
    AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1_modifiers),
    AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0_modifiers)
  };

  for (int Idx : ModifierIdx)
    MI.RemoveOperand(Idx);
}
875+
876+
/// Try to fold the immediate materialized by \p DefMI into \p UseMI.
///
/// Currently this only turns a V_MAD_F32 whose addend (src2) is \p Reg into a
/// V_MADAK_F32 carrying the constant as a literal. This is a code size
/// optimization, so it is only done when \p Reg has a single non-debug use.
///
/// \param UseMI  Instruction using \p Reg; rewritten in place on success.
/// \param DefMI  Move-immediate defining \p Reg; erased once \p Reg is dead.
/// \param Reg    Register holding the constant.
/// \param MRI    Register info used for use counts and register classes.
/// \returns true if \p UseMI was rewritten, false if nothing was changed.
bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
                                unsigned Reg, MachineRegisterInfo *MRI) const {
  // Folding the literal into more than one user would undo the size win.
  if (!MRI->hasOneNonDBGUse(Reg))
    return false;

  if (UseMI->getOpcode() != AMDGPU::V_MAD_F32)
    return false;

  // Don't fold if we are using source modifiers. The new VOP2 instructions
  // don't have them.
  if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) ||
      hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) ||
      hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers))
    return false;

  // The clamp and omod operands are dropped by the rewrite below, so folding
  // while either of them is set would silently change the result.
  const MachineOperand *Clamp = getNamedOperand(*UseMI, AMDGPU::OpName::clamp);
  const MachineOperand *OMod = getNamedOperand(*UseMI, AMDGPU::OpName::omod);
  if ((Clamp && Clamp->isImm() && Clamp->getImm() != 0) ||
      (OMod && OMod->isImm() && OMod->getImm() != 0))
    return false;

  MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0);
  MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1);
  MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2);

  // Added part is the constant: Use v_madak_f32
  if (!Src2->isReg() || Src2->getReg() != Reg)
    return false;

  // The VOP2 src0 can't be an SGPR since the constant bus use will be the
  // literal constant. An inline immediate as src0 is still fine.
  if (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))
    return false;

  // src1 has to be a register, and for the same constant bus reason it must
  // not be an SGPR either.
  if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
    return false;

  // Make sure the def really carries a plain immediate before UseMI is
  // touched, so a failed fold leaves both instructions unmodified.
  if (DefMI->getNumOperands() < 2 || !DefMI->getOperand(1).isImm())
    return false;
  const int64_t Imm = DefMI->getOperand(1).getImm();

  // FIXME: This would be a lot easier if we could return a new instruction
  // instead of having to modify in place.

  // Remove these first since they are at the end.
  UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
                                                  AMDGPU::OpName::omod));
  UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
                                                  AMDGPU::OpName::clamp));

  // Rewrite src2 while the Src2 pointer still refers to it; the modifier
  // operands removed next come before src2.
  Src2->ChangeToImmediate(Imm);

  // These come before src2.
  removeModOperands(*UseMI);
  UseMI->setDesc(get(AMDGPU::V_MADAK_F32));

  // The only non-debug use of Reg was the operand that was just turned into
  // an immediate, so Reg now has no non-debug uses and its def is dead.
  if (MRI->use_nodbg_empty(Reg))
    DefMI->eraseFromParent();

  return true;
}
939+
862940
bool
863941
SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI,
864942
AliasAnalysis *AA) const {

llvm/lib/Target/R600/SIInstrInfo.h

+3
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,9 @@ class SIInstrInfo : public AMDGPUInstrInfo {
136136

137137
bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
138138

139+
/// Try to fold the immediate defined by \p DefMI into \p UseMI's use of
/// \p Reg, rewriting \p UseMI in place. Returns true if \p UseMI was changed.
bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
140+
unsigned Reg, MachineRegisterInfo *MRI) const final;
141+
139142
/// Returns true when the TSFlags of the instruction descriptor for \p Opcode,
/// masked with SIInstrFlags::SALU, are nonzero.
bool isSALU(uint16_t Opcode) const {
140143
return get(Opcode).TSFlags & SIInstrFlags::SALU;
141144
}

llvm/test/CodeGen/R600/madak.ll

+193
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
2+
; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
3+
4+
; FIXME: Enable VI
5+
6+
declare i32 @llvm.r600.read.tidig.x() nounwind readnone
7+
declare float @llvm.fabs.f32(float) nounwind readnone
8+
9+
; GCN-LABEL: {{^}}madak_f32:
10+
; GCN: buffer_load_dword [[VA:v[0-9]+]]
11+
; GCN: buffer_load_dword [[VB:v[0-9]+]]
12+
; GCN: v_madak_f32 {{v[0-9]+}}, [[VB]], [[VA]], 0x41200000
13+
define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
14+
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
15+
%in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
16+
%in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
17+
%out.gep = getelementptr float addrspace(1)* %out, i32 %tid
18+
19+
%a = load float addrspace(1)* %in.a.gep, align 4
20+
%b = load float addrspace(1)* %in.b.gep, align 4
21+
22+
%mul = fmul float %a, %b
23+
%madak = fadd float %mul, 10.0
24+
store float %madak, float addrspace(1)* %out.gep, align 4
25+
ret void
26+
}
27+
28+
; Make sure this is only folded with one use. This is a code size
29+
; optimization and if we fold the immediate multiple times, we'll undo
30+
; it.
31+
32+
; GCN-LABEL: {{^}}madak_2_use_f32:
33+
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
34+
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
35+
; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
36+
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
37+
; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], [[VK]]
38+
; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VC]], [[VK]]
39+
; GCN: s_endpgm
40+
define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone

  ; Three consecutive inputs starting at %in[%tid].
  %in.gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
  %in.gep.1 = getelementptr float addrspace(1)* %in.gep.0, i32 1
  %in.gep.2 = getelementptr float addrspace(1)* %in.gep.0, i32 2

  ; Two consecutive outputs starting at %out[%tid]. The second pointer is
  ; derived from %out.gep.0 so that both results land in the output buffer.
  %out.gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
  %out.gep.1 = getelementptr float addrspace(1)* %out.gep.0, i32 1

  %a = load float addrspace(1)* %in.gep.0, align 4
  %b = load float addrspace(1)* %in.gep.1, align 4
  %c = load float addrspace(1)* %in.gep.2, align 4

  ; The constant 10.0 is the addend of two separate multiply-adds, so it has
  ; two uses and must stay in a register.
  %mul0 = fmul float %a, %b
  %mul1 = fmul float %a, %c
  %madak0 = fadd float %mul0, 10.0
  %madak1 = fadd float %mul1, 10.0

  store float %madak0, float addrspace(1)* %out.gep.0, align 4
  store float %madak1, float addrspace(1)* %out.gep.1, align 4
  ret void
}
63+
64+
; GCN-LABEL: {{^}}madak_m_inline_imm_f32:
65+
; GCN: buffer_load_dword [[VA:v[0-9]+]]
66+
; GCN: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
67+
define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind {
68+
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
69+
%in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
70+
%out.gep = getelementptr float addrspace(1)* %out, i32 %tid
71+
72+
%a = load float addrspace(1)* %in.a.gep, align 4
73+
74+
%mul = fmul float 4.0, %a
75+
%madak = fadd float %mul, 10.0
76+
store float %madak, float addrspace(1)* %out.gep, align 4
77+
ret void
78+
}
79+
80+
; Make sure nothing weird happens with a value that is also allowed as
81+
; an inline immediate.
82+
83+
; GCN-LABEL: {{^}}madak_inline_imm_f32:
84+
; GCN: buffer_load_dword [[VA:v[0-9]+]]
85+
; GCN: buffer_load_dword [[VB:v[0-9]+]]
86+
; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
87+
define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
88+
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
89+
%in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
90+
%in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
91+
%out.gep = getelementptr float addrspace(1)* %out, i32 %tid
92+
93+
%a = load float addrspace(1)* %in.a.gep, align 4
94+
%b = load float addrspace(1)* %in.b.gep, align 4
95+
96+
%mul = fmul float %a, %b
97+
%madak = fadd float %mul, 4.0
98+
store float %madak, float addrspace(1)* %out.gep, align 4
99+
ret void
100+
}
101+
102+
; We can't use an SGPR when forming madak
103+
; GCN-LABEL: {{^}}s_v_madak_f32:
104+
; GCN: s_load_dword [[SB:s[0-9]+]]
105+
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
106+
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]]
107+
; GCN-NOT: v_madak_f32
108+
; GCN: v_mad_f32 {{v[0-9]+}}, [[SB]], [[VA]], [[VK]]
109+
define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind {
110+
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
111+
%in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
112+
%out.gep = getelementptr float addrspace(1)* %out, i32 %tid
113+
114+
%a = load float addrspace(1)* %in.a.gep, align 4
115+
116+
%mul = fmul float %a, %b
117+
%madak = fadd float %mul, 10.0
118+
store float %madak, float addrspace(1)* %out.gep, align 4
119+
ret void
120+
}
121+
122+
; GCN-LABEL: {{^}}v_s_madak_f32:
123+
; GCN-DAG: s_load_dword [[SB:s[0-9]+]]
124+
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
125+
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]]
126+
; GCN-NOT: v_madak_f32
127+
; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[SB]], [[VK]]
128+
define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind {
129+
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
130+
%in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
131+
%out.gep = getelementptr float addrspace(1)* %out, i32 %tid
132+
133+
%b = load float addrspace(1)* %in.b.gep, align 4
134+
135+
%mul = fmul float %a, %b
136+
%madak = fadd float %mul, 10.0
137+
store float %madak, float addrspace(1)* %out.gep, align 4
138+
ret void
139+
}
140+
141+
; GCN-LABEL: {{^}}s_s_madak_f32:
142+
; GCN-NOT: v_madak_f32
143+
; GCN: v_mad_f32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
144+
define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
145+
%mul = fmul float %a, %b
146+
%madak = fadd float %mul, 10.0
147+
store float %madak, float addrspace(1)* %out, align 4
148+
ret void
149+
}
150+
151+
; GCN-LABEL: {{^}}no_madak_src0_modifier_f32:
152+
; GCN: buffer_load_dword [[VA:v[0-9]+]]
153+
; GCN: buffer_load_dword [[VB:v[0-9]+]]
154+
; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
155+
; GCN: s_endpgm
156+
define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
157+
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
158+
%in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
159+
%in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
160+
%out.gep = getelementptr float addrspace(1)* %out, i32 %tid
161+
162+
%a = load float addrspace(1)* %in.a.gep, align 4
163+
%b = load float addrspace(1)* %in.b.gep, align 4
164+
165+
%a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone
166+
167+
%mul = fmul float %a.fabs, %b
168+
%madak = fadd float %mul, 10.0
169+
store float %madak, float addrspace(1)* %out.gep, align 4
170+
ret void
171+
}
172+
173+
; GCN-LABEL: {{^}}no_madak_src1_modifier_f32:
174+
; GCN: buffer_load_dword [[VA:v[0-9]+]]
175+
; GCN: buffer_load_dword [[VB:v[0-9]+]]
176+
; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
177+
; GCN: s_endpgm
178+
define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
179+
%tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
180+
%in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
181+
%in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
182+
%out.gep = getelementptr float addrspace(1)* %out, i32 %tid
183+
184+
%a = load float addrspace(1)* %in.a.gep, align 4
185+
%b = load float addrspace(1)* %in.b.gep, align 4
186+
187+
%b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone
188+
189+
%mul = fmul float %a, %b.fabs
190+
%madak = fadd float %mul, 10.0
191+
store float %madak, float addrspace(1)* %out.gep, align 4
192+
ret void
193+
}

0 commit comments

Comments
 (0)