You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
[AArch64] Add invalid 1 x vscale costs for reductions and reduction-operations.
The code-generator is currently not able to handle scalable vectors of <vscale
x 1 x eltty>. The usual "fix" for this until it is supported is to mark the
costs of loads/stores with an invalid cost, preventing the vectorizer from
vectorizing at those factors. But on rare occasions loops don't contain
loads/stores, only reductions.
So, whilst this is still unsupported, return an invalid cost to avoid selecting
vscale x 1 VFs. The cost of a reduction is not currently used by the
vectorizer, so this adds the invalid cost to the add/mul/and/or/xor or min/max
operation that should feed the reduction. This change will be removed when
code-generation for these types is sufficiently reliable.
Fixes #99760
Copy file name to clipboardExpand all lines: llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+21Lines changed: 21 additions & 0 deletions
Original file line number
Diff line number
Diff line change
@@ -116,16 +116,20 @@ declare <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float>, i64
116
116
117
117
define void @reductions(<vscale x 4 x i32> %v0, <vscale x 4 x i64> %v1, <vscale x 4 x float> %v2, <vscale x 4 x double> %v3) {
118
118
; CHECK-LABEL: 'reductions'
119
+
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %add_nxv1i32 = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> undef)
119
120
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %add_nxv4i32 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %v0)
120
121
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %add_nxv4i64 = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %v1)
122
+
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %mul_nxv1i32 = call i32 @llvm.vector.reduce.mul.nxv1i32(<vscale x 1 x i32> undef)
121
123
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %mul_nxv4i32 = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> %v0)
122
124
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %mul_nxv4i64 = call i64 @llvm.vector.reduce.mul.nxv4i64(<vscale x 4 x i64> %v1)
125
+
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %and_nxv1i32 = call i32 @llvm.vector.reduce.and.nxv1i32(<vscale x 1 x i32> undef)
123
126
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %and_nxv4i32 = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> %v0)
124
127
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %and_nxv4i64 = call i64 @llvm.vector.reduce.and.nxv4i64(<vscale x 4 x i64> %v1)
125
128
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %or_nxv4i32 = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> %v0)
126
129
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %or_nxv4i64 = call i64 @llvm.vector.reduce.or.nxv4i64(<vscale x 4 x i64> %v1)
127
130
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %xor_nxv4i32 = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> %v0)
128
131
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %xor_nxv4i64 = call i64 @llvm.vector.reduce.xor.nxv4i64(<vscale x 4 x i64> %v1)
132
+
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %umin_nxv1i64 = call i64 @llvm.vector.reduce.umin.nxv1i64(<vscale x 1 x i64> undef)
129
133
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %umin_nxv4i32 = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> %v0)
130
134
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %umin_nxv4i64 = call i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64> %v1)
131
135
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %smin_nxv4i32 = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> %v0)
@@ -134,25 +138,32 @@ define void @reductions(<vscale x 4 x i32> %v0, <vscale x 4 x i64> %v1, <vscale
134
138
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %umax_nxv4i64 = call i64 @llvm.vector.reduce.umax.nxv4i64(<vscale x 4 x i64> %v1)
135
139
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %smax_nxv4i32 = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %v0)
136
140
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %smax_nxv4i64 = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> %v1)
141
+
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %fadd_nxv1f32 = call fast float @llvm.vector.reduce.fadd.nxv1f32(float 0.000000e+00, <vscale x 1 x float> undef)
137
142
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_nxv4f32 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v2)
138
143
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_nxv4f64 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v3)
144
+
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %fmin_nxv1f32 = call fast float @llvm.vector.reduce.fmin.nxv1f32(<vscale x 1 x float> undef)
139
145
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmin_nxv4f32 = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v2)
140
146
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmin_nxv4f64 = call fast double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v3)
147
+
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %fmax_nxv1f32 = call fast float @llvm.vector.reduce.fmax.nxv1f32(<vscale x 1 x float> undef)
141
148
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmax_nxv4f32 = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v2)
142
149
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmax_nxv4f64 = call fast double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v3)
143
150
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
144
151
;
145
152
; TYPE_BASED_ONLY-LABEL: 'reductions'
153
+
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %add_nxv1i32 = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> undef)
146
154
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %add_nxv4i32 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %v0)
147
155
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %add_nxv4i64 = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %v1)
156
+
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mul_nxv1i32 = call i32 @llvm.vector.reduce.mul.nxv1i32(<vscale x 1 x i32> undef)
148
157
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mul_nxv4i32 = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> %v0)
149
158
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mul_nxv4i64 = call i64 @llvm.vector.reduce.mul.nxv4i64(<vscale x 4 x i64> %v1)
159
+
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %and_nxv1i32 = call i32 @llvm.vector.reduce.and.nxv1i32(<vscale x 1 x i32> undef)
150
160
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %and_nxv4i32 = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> %v0)
151
161
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %and_nxv4i64 = call i64 @llvm.vector.reduce.and.nxv4i64(<vscale x 4 x i64> %v1)
152
162
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %or_nxv4i32 = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> %v0)
153
163
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %or_nxv4i64 = call i64 @llvm.vector.reduce.or.nxv4i64(<vscale x 4 x i64> %v1)
154
164
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %xor_nxv4i32 = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> %v0)
155
165
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %xor_nxv4i64 = call i64 @llvm.vector.reduce.xor.nxv4i64(<vscale x 4 x i64> %v1)
166
+
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %umin_nxv1i64 = call i64 @llvm.vector.reduce.umin.nxv1i64(<vscale x 1 x i64> undef)
156
167
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %umin_nxv4i32 = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> %v0)
157
168
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %umin_nxv4i64 = call i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64> %v1)
158
169
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %smin_nxv4i32 = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> %v0)
@@ -161,24 +172,31 @@ define void @reductions(<vscale x 4 x i32> %v0, <vscale x 4 x i64> %v1, <vscale
161
172
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %umax_nxv4i64 = call i64 @llvm.vector.reduce.umax.nxv4i64(<vscale x 4 x i64> %v1)
162
173
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %smax_nxv4i32 = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %v0)
163
174
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %smax_nxv4i64 = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> %v1)
175
+
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %fadd_nxv1f32 = call fast float @llvm.vector.reduce.fadd.nxv1f32(float 0.000000e+00, <vscale x 1 x float> undef)
164
176
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_nxv4f32 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v2)
165
177
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_nxv4f64 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v3)
178
+
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %fmin_nxv1f32 = call fast float @llvm.vector.reduce.fmin.nxv1f32(<vscale x 1 x float> undef)
166
179
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmin_nxv4f32 = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v2)
167
180
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmin_nxv4f64 = call fast double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v3)
181
+
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %fmax_nxv1f32 = call fast float @llvm.vector.reduce.fmax.nxv1f32(<vscale x 1 x float> undef)
168
182
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmax_nxv4f32 = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v2)
169
183
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmax_nxv4f64 = call fast double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v3)
170
184
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
171
185
;
186
+
%add_nxv1i32 = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> undef)
172
187
%add_nxv4i32 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %v0)
173
188
%add_nxv4i64 = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %v1)
189
+
%mul_nxv1i32 = call i32 @llvm.vector.reduce.mul.nxv1i32(<vscale x 1 x i32> undef)
174
190
%mul_nxv4i32 = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> %v0)
175
191
%mul_nxv4i64 = call i64 @llvm.vector.reduce.mul.nxv4i64(<vscale x 4 x i64> %v1)
192
+
%and_nxv1i32 = call i32 @llvm.vector.reduce.and.nxv1i32(<vscale x 1 x i32> undef)
176
193
%and_nxv4i32 = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> %v0)
177
194
%and_nxv4i64 = call i64 @llvm.vector.reduce.and.nxv4i64(<vscale x 4 x i64> %v1)
178
195
%or_nxv4i32 = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> %v0)
179
196
%or_nxv4i64 = call i64 @llvm.vector.reduce.or.nxv4i64(<vscale x 4 x i64> %v1)
180
197
%xor_nxv4i32 = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> %v0)
181
198
%xor_nxv4i64 = call i64 @llvm.vector.reduce.xor.nxv4i64(<vscale x 4 x i64> %v1)
199
+
%umin_nxv1i64 = call i64 @llvm.vector.reduce.umin.nxv1i64(<vscale x 1 x i64> undef)
182
200
%umin_nxv4i32 = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> %v0)
183
201
%umin_nxv4i64 = call i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64> %v1)
184
202
%smin_nxv4i32 = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> %v0)
@@ -188,10 +206,13 @@ define void @reductions(<vscale x 4 x i32> %v0, <vscale x 4 x i64> %v1, <vscale
188
206
%smax_nxv4i32 = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %v0)
189
207
%smax_nxv4i64 = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> %v1)
190
208
209
+
%fadd_nxv1f32 = call fast float @llvm.vector.reduce.fadd.nxv1f32(float 0.0, <vscale x 1 x float> undef)
191
210
%fadd_nxv4f32 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.0, <vscale x 4 x float> %v2)
192
211
%fadd_nxv4f64 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.0, <vscale x 4 x double> %v3)
212
+
%fmin_nxv1f32 = call fast float @llvm.vector.reduce.fmin.nxv1f32(<vscale x 1 x float> undef)
193
213
%fmin_nxv4f32 = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v2)
194
214
%fmin_nxv4f64 = call fast double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v3)
215
+
%fmax_nxv1f32 = call fast float @llvm.vector.reduce.fmax.nxv1f32(<vscale x 1 x float> undef)
195
216
%fmax_nxv4f32 = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v2)
196
217
%fmax_nxv4f64 = call fast double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v3)
0 commit comments