@@ -2390,7 +2390,9 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
                                                VPReplicateRecipe *RepRecipe,
                                                const VPLane &Lane,
                                                VPTransformState &State) {
-  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+  assert((!Instr->getType()->isAggregateType() ||
+          canVectorizeTy(Instr->getType())) &&
+         "Expected vectorizable or non-aggregate type.");
 
   // Does this instruction return a value ?
   bool IsVoidRetTy = Instr->getType()->isVoidTy();
@@ -2900,10 +2902,10 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
   return ScalarCallCost;
 }
 
-static Type *maybeVectorizeType(Type *Elt, ElementCount VF) {
-  if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
-    return Elt;
-  return VectorType::get(Elt, VF);
+static Type *maybeVectorizeType(Type *Ty, ElementCount VF) {
+  if (VF.isScalar() || !canVectorizeTy(Ty))
+    return Ty;
+  return toVectorizedTy(Ty, VF);
 }
 
 InstructionCost
@@ -3650,13 +3652,15 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
         }
       }
 
-      // ExtractValue instructions must be uniform, because the operands are
-      // known to be loop-invariant.
       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
-        assert(IsOutOfScope(EVI->getAggregateOperand()) &&
-               "Expected aggregate value to be loop invariant");
-        AddToWorklistIfAllowed(EVI);
-        continue;
+        if (IsOutOfScope(EVI->getAggregateOperand())) {
+          AddToWorklistIfAllowed(EVI);
+          continue;
+        }
+        // Only ExtractValue instructions where the aggregate value comes from a
+        // call are allowed to be non-uniform.
+        assert(isa<CallInst>(EVI->getAggregateOperand()) &&
+               "Expected aggregate value to be call return value");
       }
 
       // If there's no pointer operand, there's nothing to do.
@@ -4526,8 +4530,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
         llvm_unreachable("unhandled recipe");
       }
 
-      auto WillWiden = [&TTI, VF](Type *ScalarTy) {
-        Type *VectorTy = toVectorTy(ScalarTy, VF);
+      auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) {
         unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
         if (!NumLegalParts)
           return false;
@@ -4539,7 +4542,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
           // explicitly ask TTI about the register class uses for each part.
           return NumLegalParts <= VF.getKnownMinValue();
         }
-        // Two or more parts that share a register - are vectorized.
+        // Two or more elements that share a register - are vectorized.
        return NumLegalParts < VF.getKnownMinValue();
      };
 
@@ -4558,7 +4561,8 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
       Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
       if (!Visited.insert({ScalarTy}).second)
         continue;
-      if (WillWiden(ScalarTy))
+      Type *WideTy = toVectorizedTy(ScalarTy, VF);
+      if (any_of(getContainedTypes(WideTy), WillGenerateTargetVectors))
         return true;
     }
   }
@@ -5515,10 +5519,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
     // Compute the scalarization overhead of needed insertelement instructions
     // and phi nodes.
     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
-      ScalarCost += TTI.getScalarizationOverhead(
-          cast<VectorType>(toVectorTy(I->getType(), VF)),
-          APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
-          /*Extract*/ false, CostKind);
+      Type *WideTy = toVectorizedTy(I->getType(), VF);
+      for (Type *VectorTy : getContainedTypes(WideTy)) {
+        ScalarCost += TTI.getScalarizationOverhead(
+            cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
+            /*Insert=*/true,
+            /*Extract=*/false, CostKind);
+      }
       ScalarCost +=
           VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
     }
@@ -5529,15 +5536,18 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
     // overhead.
     for (Use &U : I->operands())
       if (auto *J = dyn_cast<Instruction>(U.get())) {
-        assert(VectorType::isValidElementType(J->getType()) &&
+        assert(canVectorizeTy(J->getType()) &&
               "Instruction has non-scalar type");
         if (CanBeScalarized(J))
           Worklist.push_back(J);
         else if (needsExtract(J, VF)) {
-          ScalarCost += TTI.getScalarizationOverhead(
-              cast<VectorType>(toVectorTy(J->getType(), VF)),
-              APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
-              /*Extract*/ true, CostKind);
+          Type *WideTy = toVectorizedTy(J->getType(), VF);
+          for (Type *VectorTy : getContainedTypes(WideTy)) {
+            ScalarCost += TTI.getScalarizationOverhead(
+                cast<VectorType>(VectorTy),
+                APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
+                /*Extract*/ true, CostKind);
+          }
         }
       }
 
@@ -6016,13 +6026,17 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
     return 0;
 
   InstructionCost Cost = 0;
-  Type *RetTy = toVectorTy(I->getType(), VF);
+  Type *RetTy = toVectorizedTy(I->getType(), VF);
   if (!RetTy->isVoidTy() &&
-      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
-    Cost += TTI.getScalarizationOverhead(
-        cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
-        /*Insert*/ true,
-        /*Extract*/ false, CostKind);
+      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) {
+
+    for (Type *VectorTy : getContainedTypes(RetTy)) {
+      Cost += TTI.getScalarizationOverhead(
+          cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getKnownMinValue()),
+          /*Insert=*/true,
+          /*Extract=*/false, CostKind);
+    }
+  }
 
   // Some targets keep addresses scalar.
   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
@@ -6280,9 +6294,9 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
 
       bool MaskRequired = Legal->isMaskRequired(CI);
       // Compute corresponding vector type for return value and arguments.
-      Type *RetTy = toVectorTy(ScalarRetTy, VF);
+      Type *RetTy = toVectorizedTy(ScalarRetTy, VF);
       for (Type *ScalarTy : ScalarTys)
-        Tys.push_back(toVectorTy(ScalarTy, VF));
+        Tys.push_back(toVectorizedTy(ScalarTy, VF));
 
       // An in-loop reduction using an fmuladd intrinsic is a special case;
       // we don't want the normal cost for that intrinsic.
@@ -6459,7 +6473,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
            HasSingleCopyAfterVectorization(I, VF));
     VectorTy = RetTy;
   } else
-    VectorTy = toVectorTy(RetTy, VF);
+    VectorTy = toVectorizedTy(RetTy, VF);
 
   if (VF.isVector() && VectorTy->isVectorTy() &&
       !TTI.getNumberOfParts(VectorTy))
@@ -8601,7 +8615,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
   case Instruction::Shl:
   case Instruction::Sub:
   case Instruction::Xor:
-  case Instruction::Freeze:
+  case Instruction::Freeze: {
     SmallVector<VPValue *> NewOps(Operands);
     if (Instruction::isBinaryOp(I->getOpcode())) {
       // The legacy cost model uses SCEV to check if some of the operands are
@@ -8626,6 +8640,16 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
       NewOps[1] = GetConstantViaSCEV(NewOps[1]);
     }
     return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
+  }
+  case Instruction::ExtractValue: {
+    SmallVector<VPValue *> NewOps(Operands);
+    Type *I32Ty = IntegerType::getInt32Ty(I->getContext());
+    auto *EVI = cast<ExtractValueInst>(I);
+    assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
+    unsigned Idx = EVI->getIndices()[0];
+    NewOps.push_back(Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false)));
+    return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
+  }
   };
   }
 
@@ -9928,7 +9952,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
             VectorType::get(UI->getType(), State.VF));
         State.set(this, Poison);
       }
-      State.packScalarIntoVectorValue(this, *State.Lane);
+      State.packScalarIntoVectorizedValue(this, *State.Lane);
     }
     return;
   }
@@ -10445,13 +10469,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     return false;
   }
 
-  if (LVL.hasStructVectorCall()) {
-    reportVectorizationFailure("Auto-vectorization of calls that return struct "
-                               "types is not yet supported",
-                               "StructCallVectorizationUnsupported", ORE, L);
-    return false;
-  }
-
   // Entrance to the VPlan-native vectorization path. Outer loops are processed
   // here. They may require CFG and instruction level transformations before
   // even evaluating whether vectorization is profitable. Since we cannot modify
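Taken together, the hunks above drop the "StructCallVectorizationUnsupported" bail-out and teach the cost model and VPlan recipes to widen calls whose scalar return type is a struct, including the extractvalue uses of that return value. As a rough, hedged sketch of the kind of source loop this targets (the function sincos_approx, its vector variant, and the caller are illustrative assumptions, not part of the patch):

// Hypothetical example: a loop with a struct-returning call. Before this
// change the loop vectorizer refused such loops; with it, the call can be
// widened when a vector variant returning a struct of vectors (e.g.
// { <4 x float>, <4 x float> }) is available, and each field access becomes
// a widened extractvalue with an explicit index operand.
struct SinCos {
  float Sin, Cos;
};

SinCos sincos_approx(float X); // assumed to have a declared vector mapping

void compute(float *Out0, float *Out1, const float *In, int N) {
  for (int I = 0; I < N; ++I) {
    SinCos R = sincos_approx(In[I]); // struct-returning call in the loop body
    Out0[I] = R.Sin;                 // extractvalue index 0
    Out1[I] = R.Cos;                 // extractvalue index 1
  }
}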