@@ -1290,8 +1290,8 @@ class LoopVectorizationCostModel {
1290
1290
if (VF.isScalar () || Uniforms.contains (VF))
1291
1291
return ;
1292
1292
setCostBasedWideningDecision (VF);
1293
- setVectorizedCallDecision(VF);
1294
1293
collectLoopUniforms (VF);
1294
+ setVectorizedCallDecision (VF);
1295
1295
collectLoopScalars (VF);
1296
1296
}
1297
1297
@@ -6194,6 +6194,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6194
6194
assert (!VF.isScalar () &&
6195
6195
" Trying to set a vectorization decision for a scalar VF" );
6196
6196
6197
+ auto ForcedScalar = ForcedScalars.find (VF);
6197
6198
for (BasicBlock *BB : TheLoop->blocks ()) {
6198
6199
// For each instruction in the old loop.
6199
6200
for (Instruction &I : *BB) {
@@ -6206,14 +6207,37 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6206
6207
InstructionCost VectorCost = InstructionCost::getInvalid ();
6207
6208
InstructionCost IntrinsicCost = InstructionCost::getInvalid ();
6208
6209
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6209
-
6210
6210
Function *ScalarFunc = CI->getCalledFunction ();
6211
6211
Type *ScalarRetTy = CI->getType ();
6212
6212
SmallVector<Type *, 4 > Tys, ScalarTys;
6213
- bool MaskRequired = Legal->isMaskRequired(CI);
6214
6213
for (auto &ArgOp : CI->args ())
6215
6214
ScalarTys.push_back (ArgOp->getType ());
6216
6215
6216
+ // Estimate cost of scalarized vector call. The source operands are
6217
+ // assumed to be vectors, so we need to extract individual elements from
6218
+ // there, execute VF scalar calls, and then gather the result into the
6219
+ // vector return value.
6220
+ InstructionCost ScalarCallCost =
6221
+ TTI.getCallInstrCost (ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6222
+
6223
+ // Compute costs of unpacking argument values for the scalar calls and
6224
+ // packing the return values to a vector.
6225
+ InstructionCost ScalarizationCost =
6226
+ getScalarizationOverhead (CI, VF, CostKind);
6227
+
6228
+ ScalarCost = ScalarCallCost * VF.getKnownMinValue () + ScalarizationCost;
6229
+ // Honor ForcedScalars decision.
6230
+ // TODO: For calls, it might still be more profitable to widen. Use
6231
+ // VPlan-based cost model to compare different options.
6232
+ if (VF.isVector () && ForcedScalar != ForcedScalars.end () &&
6233
+ ForcedScalar->second .contains (CI)) {
6234
+ setCallWideningDecision (CI, VF, CM_Scalarize, nullptr ,
6235
+ Intrinsic::not_intrinsic, std::nullopt,
6236
+ ScalarCost);
6237
+ continue ;
6238
+ }
6239
+
6240
+ bool MaskRequired = Legal->isMaskRequired (CI);
6217
6241
// Compute corresponding vector type for return value and arguments.
6218
6242
Type *RetTy = ToVectorTy (ScalarRetTy, VF);
6219
6243
for (Type *ScalarTy : ScalarTys)
@@ -6229,20 +6253,6 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6229
6253
continue ;
6230
6254
}
6231
6255
6232
- // Estimate cost of scalarized vector call. The source operands are
6233
- // assumed to be vectors, so we need to extract individual elements from
6234
- // there, execute VF scalar calls, and then gather the result into the
6235
- // vector return value.
6236
- InstructionCost ScalarCallCost =
6237
- TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6238
-
6239
- // Compute costs of unpacking argument values for the scalar calls and
6240
- // packing the return values to a vector.
6241
- InstructionCost ScalarizationCost =
6242
- getScalarizationOverhead(CI, VF, CostKind);
6243
-
6244
- ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6245
-
6246
6256
// Find the cost of vectorizing the call, if we can find a suitable
6247
6257
// vector variant of the function.
6248
6258
bool UsesMask = false ;
0 commit comments