From 45a08aed9713e48dd4cf0b5b34c8d19de6981e65 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Sun, 12 May 2024 17:03:48 -0700 Subject: [PATCH 1/3] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20ch?= =?UTF-8?q?anges=20to=20main=20this=20commit=20is=20based=20on?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.4 [skip ci] --- bolt/lib/Core/BinaryEmitter.cpp | 1 + bolt/lib/Passes/BinaryPasses.cpp | 3 + bolt/lib/Passes/ValidateMemRefs.cpp | 11 +- bolt/lib/Profile/DataAggregator.cpp | 85 ++- bolt/lib/Profile/YAMLProfileReader.cpp | 30 +- .../blarge_new_bat_branchentry.preagg.txt | 1 + .../Inputs/blarge_new_bat_order.preagg.txt | 2 + bolt/test/X86/bb-with-two-tail-calls.s | 6 +- .../X86/bolt-address-translation-yaml.test | 26 + bolt/test/X86/profile-passthrough-block.test | 66 ++ bolt/test/runtime/bolt-reserved.cpp | 4 +- .../clang/Basic/DiagnosticSemaKinds.td | 6 - clang/include/clang/Basic/IdentifierTable.h | 2 +- clang/include/clang/Basic/SourceManager.h | 6 +- clang/include/clang/Driver/Options.td | 8 +- clang/lib/ARCMigrate/ObjCMT.cpp | 2 +- clang/lib/AST/Interp/Descriptor.cpp | 8 - clang/lib/AST/Interp/Descriptor.h | 6 +- clang/lib/AST/Interp/Interp.h | 36 +- clang/lib/AST/Interp/Pointer.h | 32 +- clang/lib/AST/Interp/Program.cpp | 21 +- clang/lib/AST/PrintfFormatString.cpp | 8 +- clang/lib/ASTMatchers/Dynamic/Marshallers.cpp | 4 +- clang/lib/Basic/Builtins.cpp | 2 +- clang/lib/Basic/Diagnostic.cpp | 3 +- clang/lib/Basic/LangOptions.cpp | 2 +- clang/lib/Basic/Targets/ARM.cpp | 5 +- clang/lib/Basic/Targets/PPC.h | 2 +- clang/lib/Basic/Targets/SystemZ.h | 2 +- clang/lib/Basic/Targets/X86.cpp | 6 + clang/lib/Basic/Targets/X86.h | 7 +- clang/lib/CodeGen/CGRecordLayoutBuilder.cpp | 21 +- clang/lib/CodeGen/CodeGenFunction.cpp | 7 +- clang/lib/Driver/ToolChains/Arch/X86.cpp | 3 +- clang/lib/Driver/ToolChains/Gnu.cpp | 29 + clang/lib/Format/ContinuationIndenter.cpp | 2 +- clang/lib/Format/Format.cpp | 2 +- clang/lib/Format/QualifierAlignmentFixer.cpp | 18 +- clang/lib/Format/QualifierAlignmentFixer.h | 4 +- clang/lib/Frontend/CompilerInvocation.cpp | 2 +- .../Frontend/ModuleDependencyCollector.cpp | 2 +- clang/lib/Lex/PPDirectives.cpp | 4 +- clang/lib/Parse/ParseDecl.cpp | 2 +- clang/lib/Parse/ParseDeclCXX.cpp | 4 +- clang/lib/Parse/ParseHLSL.cpp | 2 +- clang/lib/Parse/ParseOpenMP.cpp | 4 +- clang/lib/Sema/SemaCodeComplete.cpp | 9 +- clang/lib/Sema/SemaDecl.cpp | 8 +- clang/lib/Sema/SemaDeclCXX.cpp | 4 +- clang/lib/Sema/SemaExpr.cpp | 31 +- clang/lib/Sema/SemaInit.cpp | 19 +- clang/lib/Sema/SemaStmtAsm.cpp | 2 +- clang/lib/Sema/SemaTemplate.cpp | 6 +- .../Checkers/GenericTaintChecker.cpp | 7 +- .../Checkers/LLVMConventionsChecker.cpp | 2 +- .../Checkers/LocalizationChecker.cpp | 2 +- .../StaticAnalyzer/Checkers/MallocChecker.cpp | 4 +- .../Checkers/ObjCContainersASTChecker.cpp | 4 +- .../Checkers/ObjCContainersChecker.cpp | 6 +- .../StaticAnalyzer/Checkers/StreamChecker.cpp | 2 +- .../Checkers/WebKit/PtrTypesSemantics.cpp | 9 +- .../StaticAnalyzer/Core/CheckerContext.cpp | 2 +- clang/lib/Tooling/Tooling.cpp | 2 +- clang/test/AST/Interp/builtin-align-cxx.cpp | 15 +- clang/test/AST/Interp/c.c | 3 + clang/test/AST/ast-dump-default-init-json.cpp | 6 +- clang/test/AST/ast-dump-default-init.cpp | 2 +- .../Checkers/WebKit/uncounted-obj-arg.cpp | 38 + .../Analysis/lifetime-extended-regions.cpp | 9 +- clang/test/CXX/drs/cwg16xx.cpp | 2 - clang/test/CXX/drs/cwg18xx.cpp | 66 +- clang/test/CXX/special/class.temporary/p6.cpp | 34 + clang/test/Driver/x86-target-features.c | 2 + .../Modules/implicit-module-no-timestamp.cpp | 1 - clang/test/Preprocessor/x86_target_features.c | 2 + clang/test/SemaCXX/constexpr-default-arg.cpp | 4 +- clang/test/SemaCXX/eval-crashes.cpp | 6 +- clang/test/SemaTemplate/ctad.cpp | 15 + clang/tools/diagtool/ShowEnabledWarnings.cpp | 4 +- clang/tools/diagtool/TreeView.cpp | 2 +- clang/unittests/AST/Interp/Descriptor.cpp | 2 +- clang/unittests/CodeGen/IRMatchers.h | 2 +- clang/unittests/Format/QualifierFixerTest.cpp | 94 ++- clang/www/cxx_dr_status.html | 4 +- cross-project-tests/lit.cfg.py | 14 +- cross-project-tests/lit.site.cfg.py.in | 4 + libcxx/include/CMakeLists.txt | 1 + libcxx/include/__algorithm/mismatch.h | 52 +- libcxx/include/__algorithm/simd_utils.h | 34 +- libcxx/include/__iterator/aliasing_iterator.h | 127 ++++ .../__type_traits/is_equality_comparable.h | 2 + libcxx/include/module.modulemap | 2 + .../iterators/aliasing_iterator.pass.cpp | 45 ++ .../mismatch/mismatch.pass.cpp | 47 +- lldb/test/API/lit.cfg.py | 5 + lldb/test/API/lit.site.cfg.py.in | 8 + lldb/test/API/macosx/rosetta/TestRosetta.py | 1 + .../API/macosx/universal64/TestUniversal64.py | 2 + lldb/test/Shell/helper/toolchain.py | 5 + lldb/test/Shell/lit.site.cfg.py.in | 9 + llvm/CMakeLists.txt | 4 + llvm/include/llvm/BinaryFormat/DXContainer.h | 8 +- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 8 +- llvm/include/llvm/MC/MCFragment.h | 22 + llvm/include/llvm/MC/MCObjectStreamer.h | 2 + llvm/include/llvm/MC/MCStreamer.h | 6 + .../llvm/TargetParser/X86TargetParser.def | 1 + llvm/lib/MC/MCAssembler.cpp | 118 +++- llvm/lib/MC/MCDXContainerWriter.cpp | 8 +- llvm/lib/MC/MCFragment.cpp | 12 + llvm/lib/MC/MCObjectStreamer.cpp | 5 + llvm/lib/MC/MCStreamer.cpp | 2 + llvm/lib/ObjectYAML/DXContainerEmitter.cpp | 4 +- .../Target/AMDGPU/AMDGPUHSAMetadataStreamer.h | 4 +- llvm/lib/Target/AMDGPU/SIProgramInfo.h | 3 +- .../lib/Target/X86/AsmParser/X86AsmParser.cpp | 24 + llvm/lib/Target/X86/X86.td | 2 + llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 24 +- .../lib/Target/X86/X86TargetTransformInfo.cpp | 65 +- llvm/lib/Target/X86/X86TargetTransformInfo.h | 3 - llvm/lib/TargetParser/X86TargetParser.cpp | 1 + llvm/lib/Transforms/Scalar/JumpThreading.cpp | 6 +- llvm/lib/Transforms/Scalar/LICM.cpp | 2 + .../Transforms/Vectorize/VectorCombine.cpp | 33 +- .../Analysis/CostModel/AArch64/masked_ldst.ll | 36 +- .../CostModel/AArch64/mem-op-cost-model.ll | 32 +- .../Analysis/CostModel/AArch64/sve-gather.ll | 6 +- .../CostModel/AArch64/sve-intrinsics.ll | 8 +- .../Analysis/CostModel/RISCV/masked_ldst.ll | 8 +- .../CostModel/X86/intrinsic-cost-kinds.ll | 4 +- .../X86/masked-gather-i32-with-i8-index.ll | 4 +- .../X86/masked-gather-i64-with-i8-index.ll | 4 +- .../X86/masked-intrinsic-cost-inseltpoison.ll | 650 +++++++++--------- .../CostModel/X86/masked-intrinsic-cost.ll | 650 +++++++++--------- llvm/test/CodeGen/X86/cmp.ll | 47 +- llvm/test/MC/X86/directive-avoid_end_align.s | 208 ++++++ .../JumpThreading/guard-split-debuginfo.ll | 30 +- .../preserving-debugloc-bitcast.ll | 46 ++ .../preserving-debugloc-fold-select.ll | 76 ++ .../LICM/debugloc-preserve-fmul-drop-fdiv.ll | 66 ++ .../LoopVectorize/AArch64/masked-op-cost.ll | 4 +- .../AArch64/shuffletoidentity.ll | 62 +- llvm/tools/obj2yaml/dxcontainer2yaml.cpp | 4 +- .../MC/AMDGPU/SIProgramInfoMCExprs.cpp | 12 +- llvm/unittests/Object/DXContainerTest.cpp | 43 ++ mlir/include/mlir-c/Dialect/IRDL.h | 29 + .../include/mlir/Analysis/DataFlowFramework.h | 11 + mlir/lib/CAPI/Dialect/CMakeLists.txt | 9 + mlir/lib/CAPI/Dialect/IRDL.cpp | 18 + .../Conversion/TosaToTensor/TosaToTensor.cpp | 11 +- .../Transforms/IntRangeOptimizations.cpp | 25 +- mlir/test/CAPI/CMakeLists.txt | 7 + mlir/test/CAPI/irdl.c | 58 ++ mlir/test/CMakeLists.txt | 1 + .../TosaToTensor/tosa-to-tensor.mlir | 15 + mlir/test/lit.cfg.py | 1 + 156 files changed, 2542 insertions(+), 1309 deletions(-) create mode 100644 bolt/test/X86/Inputs/blarge_new_bat_branchentry.preagg.txt create mode 100644 bolt/test/X86/Inputs/blarge_new_bat_order.preagg.txt create mode 100644 bolt/test/X86/profile-passthrough-block.test create mode 100644 libcxx/include/__iterator/aliasing_iterator.h create mode 100644 libcxx/test/libcxx/iterators/aliasing_iterator.pass.cpp create mode 100644 llvm/test/MC/X86/directive-avoid_end_align.s create mode 100644 llvm/test/Transforms/JumpThreading/preserving-debugloc-bitcast.ll create mode 100644 llvm/test/Transforms/JumpThreading/preserving-debugloc-fold-select.ll create mode 100644 llvm/test/Transforms/LICM/debugloc-preserve-fmul-drop-fdiv.ll create mode 100644 mlir/include/mlir-c/Dialect/IRDL.h create mode 100644 mlir/lib/CAPI/Dialect/IRDL.cpp create mode 100644 mlir/test/CAPI/irdl.c diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp index 6f86ddc774544..c3b3dc2e7005b 100644 --- a/bolt/lib/Core/BinaryEmitter.cpp +++ b/bolt/lib/Core/BinaryEmitter.cpp @@ -485,6 +485,7 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, FunctionFragment &FF, // This assumes the second instruction in the macro-op pair will get // assigned to its own MCRelaxableFragment. Since all JCC instructions // are relaxable, we should be safe. + Streamer.emitNeverAlignCodeAtEnd(/*Alignment to avoid=*/64, *BC.STI); } if (!EmitCodeOnly) { diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index df6dbcddeed56..867f977cebca7 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -715,6 +715,9 @@ static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) { Pred->removeSuccessor(&BB); Pred->eraseInstruction(Pred->findInstruction(Branch)); Pred->addTailCallInstruction(SuccSym); + MCInst *TailCall = Pred->getLastNonPseudoInstr(); + assert(TailCall); + MIB->setOffset(*TailCall, BB.getOffset()); } else { return false; } diff --git a/bolt/lib/Passes/ValidateMemRefs.cpp b/bolt/lib/Passes/ValidateMemRefs.cpp index f29a97c43f497..1d2c230fa7106 100644 --- a/bolt/lib/Passes/ValidateMemRefs.cpp +++ b/bolt/lib/Passes/ValidateMemRefs.cpp @@ -29,8 +29,7 @@ bool ValidateMemRefs::checkAndFixJTReference(BinaryFunction &BF, MCInst &Inst, if (!BD) return false; - const uint64_t TargetAddress = BD->getAddress() + Offset; - JumpTable *JT = BC.getJumpTableContainingAddress(TargetAddress); + JumpTable *JT = BC.getJumpTableContainingAddress(BD->getAddress()); if (!JT) return false; @@ -41,10 +40,10 @@ bool ValidateMemRefs::checkAndFixJTReference(BinaryFunction &BF, MCInst &Inst, // Accessing a jump table in another function. This is not a // legitimate jump table access, we need to replace the reference to // the jump table label with a regular rodata reference. Get a - // non-JT reference by fetching the symbol 1 byte before the JT - // label. - MCSymbol *NewSym = BC.getOrCreateGlobalSymbol(TargetAddress - 1, "DATAat"); - BC.MIB->setOperandToSymbolRef(Inst, OperandNum, NewSym, 1, &*BC.Ctx, 0); + // non-JT reference by fetching the symbol 1 byte before the JT label. + MCSymbol *NewSym = BC.getOrCreateGlobalSymbol(BD->getAddress() - 1, "DATAat"); + BC.MIB->setOperandToSymbolRef(Inst, OperandNum, NewSym, Offset + 1, &*BC.Ctx, + 0); LLVM_DEBUG(dbgs() << "BOLT-DEBUG: replaced reference @" << BF.getPrintName() << " from " << BD->getName() << " to " << NewSym->getName() << " + 1\n"); diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index f832f958def08..b76b355d6adc0 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -23,6 +23,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Errc.h" #include "llvm/Support/FileSystem.h" @@ -2354,54 +2355,48 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC, for (auto BI = BlockMap.begin(), BE = BlockMap.end(); BI != BE; ++BI) YamlBF.Blocks[BI->second.getBBIndex()].Hash = BI->second.getBBHash(); - auto getSuccessorInfo = [&](uint32_t SuccOffset, unsigned SuccDataIdx) { - const llvm::bolt::BranchInfo &BI = Branches.Data.at(SuccDataIdx); - yaml::bolt::SuccessorInfo SI; - SI.Index = BlockMap.getBBIndex(SuccOffset); - SI.Count = BI.Branches; - SI.Mispreds = BI.Mispreds; - return SI; - }; - - auto getCallSiteInfo = [&](Location CallToLoc, unsigned CallToIdx, - uint32_t Offset) { - const llvm::bolt::BranchInfo &BI = Branches.Data.at(CallToIdx); - yaml::bolt::CallSiteInfo CSI; - CSI.DestId = 0; // designated for unknown functions - CSI.EntryDiscriminator = 0; - CSI.Count = BI.Branches; - CSI.Mispreds = BI.Mispreds; - CSI.Offset = Offset; - if (BinaryData *BD = BC.getBinaryDataByName(CallToLoc.Name)) - YAMLProfileWriter::setCSIDestination(BC, CSI, BD->getSymbol(), BAT, - CallToLoc.Offset); - return CSI; + // Lookup containing basic block offset and index + auto getBlock = [&BlockMap](uint32_t Offset) { + auto BlockIt = BlockMap.upper_bound(Offset); + if (LLVM_UNLIKELY(BlockIt == BlockMap.begin())) { + errs() << "BOLT-ERROR: Invalid BAT section"; + exit(1); + } + --BlockIt; + return std::pair(BlockIt->first, BlockIt->second.getBBIndex()); }; - for (const auto &[FromOffset, SuccKV] : Branches.IntraIndex) { - if (!BlockMap.isInputBlock(FromOffset)) - continue; - const unsigned Index = BlockMap.getBBIndex(FromOffset); - yaml::bolt::BinaryBasicBlockProfile &YamlBB = YamlBF.Blocks[Index]; - for (const auto &[SuccOffset, SuccDataIdx] : SuccKV) - if (BlockMap.isInputBlock(SuccOffset)) - YamlBB.Successors.emplace_back( - getSuccessorInfo(SuccOffset, SuccDataIdx)); + for (const llvm::bolt::BranchInfo &BI : Branches.Data) { + using namespace yaml::bolt; + const auto &[BlockOffset, BlockIndex] = getBlock(BI.From.Offset); + BinaryBasicBlockProfile &YamlBB = YamlBF.Blocks[BlockIndex]; + if (BI.To.IsSymbol && BI.To.Name == BI.From.Name && BI.To.Offset != 0) { + // Internal branch + const unsigned SuccIndex = getBlock(BI.To.Offset).second; + auto &SI = YamlBB.Successors.emplace_back(SuccessorInfo{SuccIndex}); + SI.Count = BI.Branches; + SI.Mispreds = BI.Mispreds; + } else { + // Call + const uint32_t Offset = BI.From.Offset - BlockOffset; + auto &CSI = YamlBB.CallSites.emplace_back(CallSiteInfo{Offset}); + CSI.Count = BI.Branches; + CSI.Mispreds = BI.Mispreds; + if (const BinaryData *BD = BC.getBinaryDataByName(BI.To.Name)) + YAMLProfileWriter::setCSIDestination(BC, CSI, BD->getSymbol(), BAT, + BI.To.Offset); + } } - for (const auto &[FromOffset, CallTo] : Branches.InterIndex) { - auto BlockIt = BlockMap.upper_bound(FromOffset); - --BlockIt; - const unsigned BlockOffset = BlockIt->first; - const unsigned BlockIndex = BlockIt->second.getBBIndex(); - yaml::bolt::BinaryBasicBlockProfile &YamlBB = YamlBF.Blocks[BlockIndex]; - const uint32_t Offset = FromOffset - BlockOffset; - for (const auto &[CallToLoc, CallToIdx] : CallTo) - YamlBB.CallSites.emplace_back( - getCallSiteInfo(CallToLoc, CallToIdx, Offset)); - llvm::sort(YamlBB.CallSites, [](yaml::bolt::CallSiteInfo &A, - yaml::bolt::CallSiteInfo &B) { - return A.Offset < B.Offset; - }); + // Set entry counts, similar to DataReader::readProfile. + for (const llvm::bolt::BranchInfo &BI : Branches.EntryData) { + if (!BlockMap.isInputBlock(BI.To.Offset)) { + if (opts::Verbosity >= 1) + errs() << "BOLT-WARNING: Unexpected EntryData in " << FuncName + << " at 0x" << Twine::utohexstr(BI.To.Offset) << '\n'; + continue; + } + const unsigned BlockIndex = BlockMap.getBBIndex(BI.To.Offset); + YamlBF.Blocks[BlockIndex].ExecCount += BI.Branches; } // Drop blocks without a hash, won't be useful for stale matching. llvm::erase_if(YamlBF.Blocks, diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp index e4673f6e3c301..156a0efd89cd9 100644 --- a/bolt/lib/Profile/YAMLProfileReader.cpp +++ b/bolt/lib/Profile/YAMLProfileReader.cpp @@ -218,17 +218,29 @@ bool YAMLProfileReader::parseFunctionProfile( continue; } - BinaryBasicBlock &SuccessorBB = *Order[YamlSI.Index]; - if (!BB.getSuccessor(SuccessorBB.getLabel())) { - if (opts::Verbosity >= 1) - errs() << "BOLT-WARNING: no successor for block " << BB.getName() - << " that matches index " << YamlSI.Index << " or block " - << SuccessorBB.getName() << '\n'; - ++MismatchedEdges; - continue; + BinaryBasicBlock *ToBB = Order[YamlSI.Index]; + if (!BB.getSuccessor(ToBB->getLabel())) { + // Allow for BOLT-removed passthrough blocks to align with DataReader + // behavior. + BinaryBasicBlock *FTSuccessor = BB.getConditionalSuccessor(false); + if (FTSuccessor && FTSuccessor->succ_size() == 1 && + FTSuccessor->getSuccessor(ToBB->getLabel())) { + BinaryBasicBlock::BinaryBranchInfo &FTBI = + FTSuccessor->getBranchInfo(*ToBB); + FTBI.Count += YamlSI.Count; + FTBI.MispredictedCount += YamlSI.Mispreds; + ToBB = FTSuccessor; + } else { + if (opts::Verbosity >= 1) + errs() << "BOLT-WARNING: no successor for block " << BB.getName() + << " that matches index " << YamlSI.Index << " or block " + << ToBB->getName() << '\n'; + ++MismatchedEdges; + continue; + } } - BinaryBasicBlock::BinaryBranchInfo &BI = BB.getBranchInfo(SuccessorBB); + BinaryBasicBlock::BinaryBranchInfo &BI = BB.getBranchInfo(*ToBB); BI.Count += YamlSI.Count; BI.MispredictedCount += YamlSI.Mispreds; } diff --git a/bolt/test/X86/Inputs/blarge_new_bat_branchentry.preagg.txt b/bolt/test/X86/Inputs/blarge_new_bat_branchentry.preagg.txt new file mode 100644 index 0000000000000..546da92f94dba --- /dev/null +++ b/bolt/test/X86/Inputs/blarge_new_bat_branchentry.preagg.txt @@ -0,0 +1 @@ +B 80010c 800194 1 0 diff --git a/bolt/test/X86/Inputs/blarge_new_bat_order.preagg.txt b/bolt/test/X86/Inputs/blarge_new_bat_order.preagg.txt new file mode 100644 index 0000000000000..e4e1f170343c6 --- /dev/null +++ b/bolt/test/X86/Inputs/blarge_new_bat_order.preagg.txt @@ -0,0 +1,2 @@ +B 800154 401050 20 0 +F 800159 800193 7 diff --git a/bolt/test/X86/bb-with-two-tail-calls.s b/bolt/test/X86/bb-with-two-tail-calls.s index caad7b3d735f5..bb2b0cd4cc23a 100644 --- a/bolt/test/X86/bb-with-two-tail-calls.s +++ b/bolt/test/X86/bb-with-two-tail-calls.s @@ -9,11 +9,11 @@ # RUN: llvm-strip --strip-unneeded %t.o # RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib # RUN: llvm-bolt %t.exe -o %t.out --data %t.fdata --lite=0 --dyno-stats \ -# RUN: --print-sctc --print-only=_start 2>&1 | FileCheck %s +# RUN: --print-sctc --print-only=_start -enable-bat 2>&1 | FileCheck %s # CHECK-NOT: Assertion `BranchInfo.size() == 2 && "could only be called for blocks with 2 successors"' failed. # Two tail calls in the same basic block after SCTC: -# CHECK: {{.*}}: ja {{.*}} # TAILCALL # CTCTakenCount: {{.*}} -# CHECK-NEXT: {{.*}}: jmp {{.*}} # TAILCALL +# CHECK: {{.*}}: ja {{.*}} # TAILCALL # Offset: 7 # CTCTakenCount: 4 +# CHECK-NEXT: {{.*}}: jmp {{.*}} # TAILCALL # Offset: 12 .globl _start _start: diff --git a/bolt/test/X86/bolt-address-translation-yaml.test b/bolt/test/X86/bolt-address-translation-yaml.test index b3d8a88394503..e21513b7dfe59 100644 --- a/bolt/test/X86/bolt-address-translation-yaml.test +++ b/bolt/test/X86/bolt-address-translation-yaml.test @@ -5,6 +5,28 @@ RUN: llvm-bolt %t.exe -o %t.out --pa -p %p/Inputs/blarge_new.preagg.txt \ RUN: --reorder-blocks=ext-tsp --split-functions --split-strategy=cdsplit \ RUN: --reorder-functions=cdsort --enable-bat --dyno-stats --skip-funcs=main \ RUN: 2>&1 | FileCheck --check-prefix WRITE-BAT-CHECK %s +# Check that branch with entry in BAT is accounted for. +RUN: perf2bolt %t.out --pa -p %p/Inputs/blarge_new_bat_branchentry.preagg.txt \ +RUN: -w %t.yaml -o %t.fdata +RUN: llvm-bolt %t.exe -data %t.fdata -w %t.yaml-fdata -o %t.null +RUN: FileCheck --input-file %t.yaml --check-prefix BRANCHENTRY-YAML-CHECK %s +RUN: FileCheck --input-file %t.yaml-fdata --check-prefix BRANCHENTRY-YAML-CHECK %s +BRANCHENTRY-YAML-CHECK: - name: SolveCubic +BRANCHENTRY-YAML-CHECK: bid: 0 +BRANCHENTRY-YAML-CHECK: hash: 0x700F19D24600000 +BRANCHENTRY-YAML-CHECK-NEXT: succ: [ { bid: 7, cnt: 1 } +# Check that the order is correct between BAT YAML and FDATA->YAML. +RUN: perf2bolt %t.out --pa -p %p/Inputs/blarge_new_bat_order.preagg.txt \ +RUN: -w %t.yaml -o %t.fdata +RUN: llvm-bolt %t.exe -data %t.fdata -w %t.yaml-fdata -o %t.null +RUN: FileCheck --input-file %t.yaml --check-prefix ORDER-YAML-CHECK %s +RUN: FileCheck --input-file %t.yaml-fdata --check-prefix ORDER-YAML-CHECK %s +ORDER-YAML-CHECK: - name: SolveCubic +ORDER-YAML-CHECK: bid: 3 +ORDER-YAML-CHECK: hash: 0xDDA1DC5F69F900AC +ORDER-YAML-CHECK-NEXT: calls: [ { off: 0x26, fid: [[#]], cnt: 20 } ] +ORDER-YAML-CHECK-NEXT: succ: [ { bid: 5, cnt: 7 } +# Large profile test RUN: perf2bolt %t.out --pa -p %p/Inputs/blarge_new_bat.preagg.txt -w %t.yaml -o %t.fdata \ RUN: 2>&1 | FileCheck --check-prefix READ-BAT-CHECK %s RUN: FileCheck --input-file %t.yaml --check-prefix YAML-BAT-CHECK %s @@ -48,6 +70,10 @@ YAML-BAT-CHECK-NEXT: hash: 0x6AF7E61EA3966722 YAML-BAT-CHECK-NEXT: exec: 25 YAML-BAT-CHECK-NEXT: nblocks: 15 YAML-BAT-CHECK-NEXT: blocks: +YAML-BAT-CHECK-NEXT: - bid: 0 +YAML-BAT-CHECK-NEXT: insns: [[#]] +YAML-BAT-CHECK-NEXT: hash: 0x700F19D24600000 +YAML-BAT-CHECK-NEXT: exec: 25 YAML-BAT-CHECK: - bid: 3 YAML-BAT-CHECK-NEXT: insns: [[#]] YAML-BAT-CHECK-NEXT: hash: 0xDDA1DC5F69F900AC diff --git a/bolt/test/X86/profile-passthrough-block.test b/bolt/test/X86/profile-passthrough-block.test new file mode 100644 index 0000000000000..54920e05d5520 --- /dev/null +++ b/bolt/test/X86/profile-passthrough-block.test @@ -0,0 +1,66 @@ +## This reproduces a bug with BOLT setting incorrect discriminator for +## secondary entry points in YAML profile. + +# REQUIRES: system-linux +# RUN: split-file %s %t +# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/main.s -o %t.o +# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib +# RUN: llvm-bolt %t.exe -o %t.out --data %t/yaml --profile-ignore-hash -v=1 \ +# RUN: 2>&1 | FileCheck %s + +# CHECK: BOLT-INFO: 1 out of 1 functions in the binary (100.0%) have non-empty execution profile +# CHECK-NOT: BOLT-WARNING: no successor for block .LFT0 that matches index 3 or block .Ltmp0 + +#--- main.s +.globl main +.type main, @function +main: + .cfi_startproc +.LBB00: + pushq %rbp + movq %rsp, %rbp + subq $16, %rsp + testq %rax, %rax + js .LBB03 +.LBB01: + jne .LBB04 +.LBB02: + nop +.LBB03: + xorl %eax, %eax + addq $16, %rsp + popq %rbp + retq +.LBB04: + xorl %eax, %eax + addq $16, %rsp + popq %rbp + retq +## For relocations against .text +.LBB05: + call exit + .cfi_endproc + .size main, .-main + +#--- yaml +--- +header: + profile-version: 1 + binary-name: 'profile-passthrough-block.s.tmp.exe' + binary-build-id: '' + profile-flags: [ lbr ] + profile-origin: branch profile reader + profile-events: '' + dfs-order: false + hash-func: xxh3 +functions: + - name: main + fid: 0 + hash: 0x0000000000000000 + exec: 1 + nblocks: 6 + blocks: + - bid: 1 + insns: 1 + succ: [ { bid: 3, cnt: 1} ] +... diff --git a/bolt/test/runtime/bolt-reserved.cpp b/bolt/test/runtime/bolt-reserved.cpp index 5e93b4f7c3d40..c88b1e284d074 100644 --- a/bolt/test/runtime/bolt-reserved.cpp +++ b/bolt/test/runtime/bolt-reserved.cpp @@ -16,8 +16,8 @@ * not enough for allocating new sections. */ -// RUN: %clang %s -o %t.exe -Wl,--no-eh-frame-hdr -Wl,-q -DTINY -// RUN: not llvm-bolt %t.exe -o %t.bolt.exe 2>&1 | \ +// RUN: %clang %s -o %t.tiny.exe -Wl,--no-eh-frame-hdr -Wl,-q -DTINY +// RUN: not llvm-bolt %t.tiny.exe -o %t.tiny.bolt.exe 2>&1 | \ // RUN: FileCheck %s --check-prefix=CHECK-TINY // CHECK-TINY: BOLT-ERROR: reserved space (1 byte) is smaller than required diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index d6863f90edb6e..9e82130c93609 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -10035,12 +10035,6 @@ def warn_new_dangling_initializer_list : Warning< "the allocated initializer list}0 " "will be destroyed at the end of the full-expression">, InGroup; -def warn_unsupported_lifetime_extension : Warning< - "lifetime extension of " - "%select{temporary|backing array of initializer list}0 created " - "by aggregate initialization using a default member initializer " - "is not yet supported; lifetime of %select{temporary|backing array}0 " - "will end at the end of the full-expression">, InGroup; // For non-floating point, expressions of the form x == x or x != x // should result in a warning, since these always evaluate to a constant. diff --git a/clang/include/clang/Basic/IdentifierTable.h b/clang/include/clang/Basic/IdentifierTable.h index a893e6f4d3d39..ae9ebd9f59154 100644 --- a/clang/include/clang/Basic/IdentifierTable.h +++ b/clang/include/clang/Basic/IdentifierTable.h @@ -738,7 +738,7 @@ class IdentifierTable { II->Entry = &Entry; // If this is the 'import' contextual keyword, mark it as such. - if (Name.equals("import")) + if (Name == "import") II->setModulesImport(true); return *II; diff --git a/clang/include/clang/Basic/SourceManager.h b/clang/include/clang/Basic/SourceManager.h index d2ece14da0b11..5258bab584f49 100644 --- a/clang/include/clang/Basic/SourceManager.h +++ b/clang/include/clang/Basic/SourceManager.h @@ -1504,7 +1504,7 @@ class SourceManager : public RefCountedBase { if (Presumed.isInvalid()) return false; StringRef Filename(Presumed.getFilename()); - return Filename.equals(""); + return Filename == ""; } /// Returns whether \p Loc is located in a file. @@ -1513,7 +1513,7 @@ class SourceManager : public RefCountedBase { if (Presumed.isInvalid()) return false; StringRef Filename(Presumed.getFilename()); - return Filename.equals(""); + return Filename == ""; } /// Returns whether \p Loc is located in a file. @@ -1522,7 +1522,7 @@ class SourceManager : public RefCountedBase { if (Presumed.isInvalid()) return false; StringRef Filename(Presumed.getFilename()); - return Filename.equals(""); + return Filename == ""; } /// Returns if a SourceLocation is in a system header. diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 73a2518480e9b..9c4915678f210 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5457,6 +5457,10 @@ def pg : Flag<["-"], "pg">, HelpText<"Enable mcount instrumentation">, MarshallingInfoFlag>; def pipe : Flag<["-", "--"], "pipe">, HelpText<"Use pipes between commands, when possible">; +// Facebook T92898286 +def post_link_optimize : Flag<["--"], "post-link-optimize">, + HelpText<"Apply post-link optimizations using BOLT">; +// End Facebook T92898286 def prebind__all__twolevel__modules : Flag<["-"], "prebind_all_twolevel_modules">; def prebind : Flag<["-"], "prebind">; def preload : Flag<["-"], "preload">; @@ -6273,9 +6277,9 @@ def mno_gather : Flag<["-"], "mno-gather">, Group, def mno_scatter : Flag<["-"], "mno-scatter">, Group, HelpText<"Disable generation of scatter instructions in auto-vectorization(x86 only)">; def mapx_features_EQ : CommaJoined<["-"], "mapx-features=">, Group, - HelpText<"Enable features of APX">, Values<"egpr,push2pop2,ppx,ndd,ccmp,cf">; + HelpText<"Enable features of APX">, Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf">; def mno_apx_features_EQ : CommaJoined<["-"], "mno-apx-features=">, Group, - HelpText<"Disable features of APX">, Values<"egpr,push2pop2,ppx,ndd,ccmp,cf">; + HelpText<"Disable features of APX">, Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf">; // Features egpr, push2pop2, ppx and ndd are validated with llvm-test-suite && cpu2017 on Intel SDE. // For stability, we turn on these features only for -mapxf. After a feature pass the validation, // we will add it to -mapxf. diff --git a/clang/lib/ARCMigrate/ObjCMT.cpp b/clang/lib/ARCMigrate/ObjCMT.cpp index b9dcfb8951b3e..aaf41dc4039cd 100644 --- a/clang/lib/ARCMigrate/ObjCMT.cpp +++ b/clang/lib/ARCMigrate/ObjCMT.cpp @@ -484,7 +484,7 @@ static void rewriteToObjCProperty(const ObjCMethodDecl *Getter, // Short circuit 'delegate' properties that contain the name "delegate" or // "dataSource", or have exact name "target" to have 'assign' attribute. - if (PropertyName.equals("target") || PropertyName.contains("delegate") || + if (PropertyName == "target" || PropertyName.contains("delegate") || PropertyName.contains("dataSource")) { QualType QT = Getter->getReturnType(); if (!QT->isRealType()) diff --git a/clang/lib/AST/Interp/Descriptor.cpp b/clang/lib/AST/Interp/Descriptor.cpp index 954c58c8cb371..d0466902247b4 100644 --- a/clang/lib/AST/Interp/Descriptor.cpp +++ b/clang/lib/AST/Interp/Descriptor.cpp @@ -347,14 +347,6 @@ Descriptor::Descriptor(const DeclTy &D) assert(Source && "Missing source"); } -/// Dummy array. -Descriptor::Descriptor(const DeclTy &D, UnknownSize) - : Source(D), ElemSize(1), Size(UnknownSizeMark), MDSize(0), - AllocSize(MDSize), ElemRecord(nullptr), IsConst(true), IsMutable(false), - IsTemporary(false), IsArray(true), IsDummy(true) { - assert(Source && "Missing source"); -} - QualType Descriptor::getType() const { if (auto *E = asExpr()) return E->getType(); diff --git a/clang/lib/AST/Interp/Descriptor.h b/clang/lib/AST/Interp/Descriptor.h index cd20495c259c7..fcb14e76e7eb0 100644 --- a/clang/lib/AST/Interp/Descriptor.h +++ b/clang/lib/AST/Interp/Descriptor.h @@ -128,7 +128,7 @@ struct Descriptor final { /// Flag indicating if the block is an array. const bool IsArray = false; /// Flag indicating if this is a dummy descriptor. - const bool IsDummy = false; + bool IsDummy = false; /// Storage management methods. const BlockCtorFn CtorFn = nullptr; @@ -162,8 +162,8 @@ struct Descriptor final { /// Allocates a dummy descriptor. Descriptor(const DeclTy &D); - /// Allocates a dummy array descriptor. - Descriptor(const DeclTy &D, UnknownSize); + /// Make this descriptor a dummy descriptor. + void makeDummy() { IsDummy = true; } QualType getType() const; QualType getElemQualType() const; diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index 66d30cc3fbaab..a0bf874300120 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -823,9 +823,9 @@ inline bool CmpHelperEQ(InterpState &S, CodePtr OpPC, CompareFn Fn) { // element in the same array are NOT equal. They have the same Base value, // but a different Offset. This is a pretty rare case, so we fix this here // by comparing pointers to the first elements. - if (!LHS.isZero() && !LHS.isDummy() && LHS.isArrayRoot()) + if (!LHS.isZero() && LHS.isArrayRoot()) VL = LHS.atIndex(0).getByteOffset(); - if (!RHS.isZero() && !RHS.isDummy() && RHS.isArrayRoot()) + if (!RHS.isZero() && RHS.isArrayRoot()) VR = RHS.atIndex(0).getByteOffset(); S.Stk.push(BoolT::from(Fn(Compare(VL, VR)))); @@ -1241,14 +1241,16 @@ inline bool GetPtrField(InterpState &S, CodePtr OpPC, uint32_t Off) { !CheckNull(S, OpPC, Ptr, CSK_Field)) return false; - if (CheckDummy(S, OpPC, Ptr)) { - if (!CheckExtern(S, OpPC, Ptr)) - return false; - if (!CheckRange(S, OpPC, Ptr, CSK_Field)) - return false; - if (!CheckSubobject(S, OpPC, Ptr, CSK_Field)) - return false; - } + if (!CheckExtern(S, OpPC, Ptr)) + return false; + if (!CheckRange(S, OpPC, Ptr, CSK_Field)) + return false; + if (!CheckSubobject(S, OpPC, Ptr, CSK_Field)) + return false; + + if (Ptr.isBlockPointer() && Off > Ptr.block()->getSize()) + return false; + S.Stk.push(Ptr.atField(Off)); return true; } @@ -2034,11 +2036,6 @@ inline bool ArrayElemPtr(InterpState &S, CodePtr OpPC) { if (!Ptr.isZero()) { if (!CheckArray(S, OpPC, Ptr)) return false; - - if (Ptr.isDummy()) { - S.Stk.push(Ptr); - return true; - } } if (!OffsetHelper(S, OpPC, Offset, Ptr)) @@ -2055,11 +2052,6 @@ inline bool ArrayElemPtrPop(InterpState &S, CodePtr OpPC) { if (!Ptr.isZero()) { if (!CheckArray(S, OpPC, Ptr)) return false; - - if (Ptr.isDummy()) { - S.Stk.push(Ptr); - return true; - } } if (!OffsetHelper(S, OpPC, Offset, Ptr)) @@ -2113,12 +2105,12 @@ inline bool CopyArray(InterpState &S, CodePtr OpPC, uint32_t SrcIndex, uint32_t inline bool ArrayDecay(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.pop(); - if (Ptr.isZero() || Ptr.isDummy()) { + if (Ptr.isZero()) { S.Stk.push(Ptr); return true; } - if (!Ptr.isUnknownSizeArray()) { + if (!Ptr.isUnknownSizeArray() || Ptr.isDummy()) { S.Stk.push(Ptr.atIndex(0)); return true; } diff --git a/clang/lib/AST/Interp/Pointer.h b/clang/lib/AST/Interp/Pointer.h index c4d701bc71b7b..79fab05670e96 100644 --- a/clang/lib/AST/Interp/Pointer.h +++ b/clang/lib/AST/Interp/Pointer.h @@ -226,8 +226,7 @@ class Pointer { return *this; // If at base, point to an array of base types. - if (asBlockPointer().Base == 0 || - asBlockPointer().Base == sizeof(InlineDescriptor)) + if (isRoot()) return Pointer(asBlockPointer().Pointee, RootPtrMark, 0); // Step into the containing array, if inside one. @@ -306,10 +305,8 @@ class Pointer { const Descriptor *getFieldDesc() const { if (isIntegralPointer()) return asIntPointer().Desc; - if (isBlockPointer() && - (asBlockPointer().Base == 0 || - asBlockPointer().Base == sizeof(InlineDescriptor) || - asBlockPointer().Base == RootPtrMark)) + + if (isRoot()) return getDeclDesc(); return getInlineDesc()->Desc; } @@ -390,8 +387,7 @@ class Pointer { // If this points inside a dummy block, return true. // FIXME: This might change in the future. If it does, we need // to set the proper Ctor/Dtor functions for dummy Descriptors. - if (asBlockPointer().Base != 0 && - asBlockPointer().Base != sizeof(InlineDescriptor) && isDummy()) + if (!isRoot() && isDummy()) return true; return getFieldDesc()->isUnknownSizeArray(); } @@ -403,9 +399,11 @@ class Pointer { } /// Pointer points directly to a block. bool isRoot() const { - return (asBlockPointer().Base == 0 || - asBlockPointer().Base == RootPtrMark) && - Offset == 0; + if (isZero() || isIntegralPointer()) + return true; + return (asBlockPointer().Base == + asBlockPointer().Pointee->getDescriptor()->getMetadataSize() || + asBlockPointer().Base == 0); } /// If this pointer has an InlineDescriptor we can use to initialize. bool canBeInitialized() const { @@ -487,9 +485,7 @@ class Pointer { bool isActive() const { if (!isBlockPointer()) return true; - return asBlockPointer().Base == 0 || - asBlockPointer().Base == sizeof(InlineDescriptor) || - getInlineDesc()->IsActive; + return isRoot() || getInlineDesc()->IsActive; } /// Checks if a structure is a base class. bool isBaseClass() const { return isField() && getInlineDesc()->IsBase; } @@ -508,10 +504,7 @@ class Pointer { bool isConst() const { if (isIntegralPointer()) return true; - return (asBlockPointer().Base == 0 || - asBlockPointer().Base == sizeof(InlineDescriptor)) - ? getDeclDesc()->IsConst - : getInlineDesc()->IsConst; + return isRoot() ? getDeclDesc()->IsConst : getInlineDesc()->IsConst; } /// Returns the declaration ID. @@ -567,6 +560,9 @@ class Pointer { if (!asBlockPointer().Pointee) return false; + if (isDummy()) + return false; + return isElementPastEnd() || getSize() == getOffset(); } diff --git a/clang/lib/AST/Interp/Program.cpp b/clang/lib/AST/Interp/Program.cpp index 02075c20cf556..87c767f85e799 100644 --- a/clang/lib/AST/Interp/Program.cpp +++ b/clang/lib/AST/Interp/Program.cpp @@ -144,17 +144,20 @@ std::optional Program::getOrCreateDummy(const ValueDecl *VD) { if (auto It = DummyVariables.find(VD); It != DummyVariables.end()) return It->second; - // Create dummy descriptor. - // We create desriptors of 'array of unknown size' if the type is an array - // type _and_ the size isn't known (it's not a ConstantArrayType). If the size - // is known however, we create a regular dummy pointer. Descriptor *Desc; - if (const auto *AT = VD->getType()->getAsArrayTypeUnsafe(); - AT && !isa(AT)) - Desc = allocateDescriptor(VD, Descriptor::UnknownSize{}); + if (std::optional T = Ctx.classify(VD->getType())) + Desc = createDescriptor(VD, *T, std::nullopt, true, false); else + Desc = createDescriptor(VD, VD->getType().getTypePtr(), std::nullopt, true, + false); + if (!Desc) Desc = allocateDescriptor(VD); + assert(Desc); + Desc->makeDummy(); + + assert(Desc->isDummy()); + // Allocate a block for storage. unsigned I = Globals.size(); @@ -310,8 +313,7 @@ Record *Program::getOrCreateRecord(const RecordDecl *RD) { for (const FieldDecl *FD : RD->fields()) { // Note that we DO create fields and descriptors // for unnamed bitfields here, even though we later ignore - // them everywhere. That's because so the FieldDecl's - // getFieldIndex() matches. + // them everywhere. That's so the FieldDecl's getFieldIndex() matches. // Reserve space for the field's descriptor and the offset. BaseSize += align(sizeof(InlineDescriptor)); @@ -344,6 +346,7 @@ Descriptor *Program::createDescriptor(const DeclTy &D, const Type *Ty, Descriptor::MetadataSize MDSize, bool IsConst, bool IsTemporary, bool IsMutable, const Expr *Init) { + // Classes and structures. if (const auto *RT = Ty->getAs()) { if (const auto *Record = getOrCreateRecord(RT->getDecl())) diff --git a/clang/lib/AST/PrintfFormatString.cpp b/clang/lib/AST/PrintfFormatString.cpp index fec8ce13e8c44..dd3b38fabb550 100644 --- a/clang/lib/AST/PrintfFormatString.cpp +++ b/clang/lib/AST/PrintfFormatString.cpp @@ -146,13 +146,13 @@ static PrintfSpecifierResult ParsePrintfSpecifier(FormatStringHandler &H, if (Warn && (Size == 0 || Size > 8)) H.handleInvalidMaskType(MaskType); FS.setMaskType(MaskType); - } else if (MatchedStr.equals("sensitive")) + } else if (MatchedStr == "sensitive") PrivacyFlags = clang::analyze_os_log::OSLogBufferItem::IsSensitive; else if (PrivacyFlags != - clang::analyze_os_log::OSLogBufferItem::IsSensitive && - MatchedStr.equals("private")) + clang::analyze_os_log::OSLogBufferItem::IsSensitive && + MatchedStr == "private") PrivacyFlags = clang::analyze_os_log::OSLogBufferItem::IsPrivate; - else if (PrivacyFlags == 0 && MatchedStr.equals("public")) + else if (PrivacyFlags == 0 && MatchedStr == "public") PrivacyFlags = clang::analyze_os_log::OSLogBufferItem::IsPublic; } else { size_t CommaOrBracePos = diff --git a/clang/lib/ASTMatchers/Dynamic/Marshallers.cpp b/clang/lib/ASTMatchers/Dynamic/Marshallers.cpp index cf9ae7c974a62..37c91abb5c839 100644 --- a/clang/lib/ASTMatchers/Dynamic/Marshallers.cpp +++ b/clang/lib/ASTMatchers/Dynamic/Marshallers.cpp @@ -21,7 +21,7 @@ getBestGuess(llvm::StringRef Search, llvm::ArrayRef Allowed, llvm::StringRef Res; for (const llvm::StringRef &Item : Allowed) { if (Item.equals_insensitive(Search)) { - assert(!Item.equals(Search) && "This should be handled earlier on."); + assert(Item != Search && "This should be handled earlier on."); MaxEditDistance = 1; Res = Item; continue; @@ -41,7 +41,7 @@ getBestGuess(llvm::StringRef Search, llvm::ArrayRef Allowed, if (!NoPrefix.consume_front(DropPrefix)) continue; if (NoPrefix.equals_insensitive(Search)) { - if (NoPrefix.equals(Search)) + if (NoPrefix == Search) return Item.str(); MaxEditDistance = 1; Res = Item; diff --git a/clang/lib/Basic/Builtins.cpp b/clang/lib/Basic/Builtins.cpp index 3467847ac1672..b116abbe034f7 100644 --- a/clang/lib/Basic/Builtins.cpp +++ b/clang/lib/Basic/Builtins.cpp @@ -64,7 +64,7 @@ bool Builtin::Context::isBuiltinFunc(llvm::StringRef FuncName) { bool InStdNamespace = FuncName.consume_front("std-"); for (unsigned i = Builtin::NotBuiltin + 1; i != Builtin::FirstTSBuiltin; ++i) { - if (FuncName.equals(BuiltinInfo[i].Name) && + if (FuncName == BuiltinInfo[i].Name && (bool)strchr(BuiltinInfo[i].Attributes, 'z') == InStdNamespace) return strchr(BuiltinInfo[i].Attributes, 'f') != nullptr; } diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp index 0208ccc31bd7f..10136b4cd9435 100644 --- a/clang/lib/Basic/Diagnostic.cpp +++ b/clang/lib/Basic/Diagnostic.cpp @@ -851,8 +851,7 @@ FormatDiagnostic(const char *DiagStr, const char *DiagEnd, // When the diagnostic string is only "%0", the entire string is being given // by an outside source. Remove unprintable characters from this string // and skip all the other string processing. - if (DiagEnd - DiagStr == 2 && - StringRef(DiagStr, DiagEnd - DiagStr).equals("%0") && + if (DiagEnd - DiagStr == 2 && StringRef(DiagStr, DiagEnd - DiagStr) == "%0" && getArgKind(0) == DiagnosticsEngine::ak_std_string) { const std::string &S = getArgStdStr(0); EscapeStringForDiagnostic(S, OutStr); diff --git a/clang/lib/Basic/LangOptions.cpp b/clang/lib/Basic/LangOptions.cpp index a0adfbf61840e..2b906463931d3 100644 --- a/clang/lib/Basic/LangOptions.cpp +++ b/clang/lib/Basic/LangOptions.cpp @@ -48,7 +48,7 @@ void LangOptions::resetNonModularOptions() { bool LangOptions::isNoBuiltinFunc(StringRef FuncName) const { for (unsigned i = 0, e = NoBuiltinFuncs.size(); i != e; ++i) - if (FuncName.equals(NoBuiltinFuncs[i])) + if (FuncName == NoBuiltinFuncs[i]) return true; return false; } diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp index 877799c66ec4f..7423626d7c3cb 100644 --- a/clang/lib/Basic/Targets/ARM.cpp +++ b/clang/lib/Basic/Targets/ARM.cpp @@ -173,8 +173,7 @@ bool ARMTargetInfo::supportsThumb() const { } bool ARMTargetInfo::supportsThumb2() const { - return CPUAttr.equals("6T2") || - (ArchVersion >= 7 && !CPUAttr.equals("8M_BASE")); + return CPUAttr == "6T2" || (ArchVersion >= 7 && CPUAttr != "8M_BASE"); } StringRef ARMTargetInfo::getCPUAttr() const { @@ -1162,7 +1161,7 @@ bool ARMTargetInfo::validateAsmConstraint( return true; case 'j': // An immediate integer between 0 and 65535 (valid for MOVW) // only available in ARMv6T2 and above - if (CPUAttr.equals("6T2") || ArchVersion >= 7) { + if (CPUAttr == "6T2" || ArchVersion >= 7) { Info.setRequiresImmediate(0, 65535); return true; } diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h index 496b6131d09bb..fc23c30c68523 100644 --- a/clang/lib/Basic/Targets/PPC.h +++ b/clang/lib/Basic/Targets/PPC.h @@ -360,7 +360,7 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo { bool hasBitIntType() const override { return true; } bool isSPRegName(StringRef RegName) const override { - return RegName.equals("r1") || RegName.equals("x1"); + return RegName == "r1" || RegName == "x1"; } // We support __builtin_cpu_supports/__builtin_cpu_is on targets that diff --git a/clang/lib/Basic/Targets/SystemZ.h b/clang/lib/Basic/Targets/SystemZ.h index 73d3aa01a043f..3bc6f2c1d3083 100644 --- a/clang/lib/Basic/Targets/SystemZ.h +++ b/clang/lib/Basic/Targets/SystemZ.h @@ -84,7 +84,7 @@ class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo { ArrayRef getGCCAddlRegNames() const override; bool isSPRegName(StringRef RegName) const override { - return RegName.equals("r15"); + return RegName == "r15"; } bool validateAsmConstraint(const char *&Name, diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index 67e2126cf766b..b823eaf6ce336 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -458,6 +458,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector &Features, HasNDD = true; } else if (Feature == "+ccmp") { HasCCMP = true; + } else if (Feature == "+nf") { + HasNF = true; } else if (Feature == "+cf") { HasCF = true; } @@ -969,6 +971,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__NDD__"); if (HasCCMP) Builder.defineMacro("__CCMP__"); + if (HasNF) + Builder.defineMacro("__NF__"); if (HasCF) Builder.defineMacro("__CF__"); // Condition here is aligned with the feature set of mapxf in Options.td @@ -1174,6 +1178,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const { .Case("ppx", true) .Case("ndd", true) .Case("ccmp", true) + .Case("nf", true) .Case("cf", true) .Default(false); } @@ -1296,6 +1301,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { .Case("ppx", HasPPX) .Case("ndd", HasNDD) .Case("ccmp", HasCCMP) + .Case("nf", HasNF) .Case("cf", HasCF) .Default(false); } diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index c14e4d5f433d8..6a0a6cb84203d 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -173,6 +173,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { bool HasPPX = false; bool HasNDD = false; bool HasCCMP = false; + bool HasNF = false; bool HasCF = false; protected: @@ -218,7 +219,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { ArrayRef getGCCAddlRegNames() const override; bool isSPRegName(StringRef RegName) const override { - return RegName.equals("esp") || RegName.equals("rsp"); + return RegName == "esp" || RegName == "rsp"; } bool supportsCpuSupports() const override { return true; } @@ -246,7 +247,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { bool &HasSizeMismatch) const override { // esp and ebp are the only 32-bit registers the x86 backend can currently // handle. - if (RegName.equals("esp") || RegName.equals("ebp")) { + if (RegName == "esp" || RegName == "ebp") { // Check that the register size is 32-bit. HasSizeMismatch = RegSize != 32; return true; @@ -802,7 +803,7 @@ class LLVM_LIBRARY_VISIBILITY X86_64TargetInfo : public X86TargetInfo { bool &HasSizeMismatch) const override { // rsp and rbp are the only 64-bit registers the x86 backend can currently // handle. - if (RegName.equals("rsp") || RegName.equals("rbp")) { + if (RegName == "rsp" || RegName == "rbp") { // Check that the register size is 64-bit. HasSizeMismatch = RegSize != 64; return true; diff --git a/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp b/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp index 868b1ab98e048..5169be204c14d 100644 --- a/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp +++ b/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp @@ -75,7 +75,7 @@ struct CGRecordLowering { // sentinel member type that ensures correct rounding. struct MemberInfo { CharUnits Offset; - enum InfoKind { VFPtr, VBPtr, Field, Base, VBase, Scissor } Kind; + enum InfoKind { VFPtr, VBPtr, Field, Base, VBase } Kind; llvm::Type *Data; union { const FieldDecl *FD; @@ -197,7 +197,7 @@ struct CGRecordLowering { const CXXRecordDecl *Query) const; void calculateZeroInit(); CharUnits calculateTailClippingOffset(bool isNonVirtualBaseType) const; - void checkBitfieldClipping() const; + void checkBitfieldClipping(bool isNonVirtualBaseType) const; /// Determines if we need a packed llvm struct. void determinePacked(bool NVBaseType); /// Inserts padding everywhere it's needed. @@ -299,8 +299,8 @@ void CGRecordLowering::lower(bool NVBaseType) { accumulateVBases(); } llvm::stable_sort(Members); + checkBitfieldClipping(NVBaseType); Members.push_back(StorageInfo(Size, getIntNType(8))); - checkBitfieldClipping(); determinePacked(NVBaseType); insertPadding(); Members.pop_back(); @@ -894,8 +894,6 @@ CGRecordLowering::calculateTailClippingOffset(bool isNonVirtualBaseType) const { } void CGRecordLowering::accumulateVBases() { - Members.push_back(MemberInfo(calculateTailClippingOffset(false), - MemberInfo::Scissor, nullptr, RD)); for (const auto &Base : RD->vbases()) { const CXXRecordDecl *BaseDecl = Base.getType()->getAsCXXRecordDecl(); if (BaseDecl->isEmpty()) @@ -950,18 +948,19 @@ void CGRecordLowering::calculateZeroInit() { } // Verify accumulateBitfields computed the correct storage representations. -void CGRecordLowering::checkBitfieldClipping() const { +void CGRecordLowering::checkBitfieldClipping(bool IsNonVirtualBaseType) const { #ifndef NDEBUG + auto ScissorOffset = calculateTailClippingOffset(IsNonVirtualBaseType); auto Tail = CharUnits::Zero(); for (const auto &M : Members) { - // Only members with data and the scissor can cut into tail padding. - if (!M.Data && M.Kind != MemberInfo::Scissor) + // Only members with data could possibly overlap. + if (!M.Data) continue; assert(M.Offset >= Tail && "Bitfield access unit is not clipped"); - Tail = M.Offset; - if (M.Data) - Tail += getSize(M.Data); + Tail = M.Offset + getSize(M.Data); + assert((Tail <= ScissorOffset || M.Offset >= ScissorOffset) && + "Bitfield straddles scissor offset"); } #endif } diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 477814140a9e2..9f16fcb438557 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -2765,10 +2765,9 @@ llvm::Value *CodeGenFunction::FormAArch64ResolverCondition( // only for features that are not enabled in the target. The exception is // for features whose extension instructions are executed as NOP on targets // without extension support. - if (!getContext().getTargetInfo().hasFeature(Feature) || - Feature.equals("bti") || Feature.equals("memtag") || - Feature.equals("memtag2") || Feature.equals("memtag3") || - Feature.equals("dgh")) + if (!getContext().getTargetInfo().hasFeature(Feature) || Feature == "bti" || + Feature == "memtag" || Feature == "memtag2" || Feature == "memtag3" || + Feature == "dgh") CondFeatures.push_back(Feature); } if (!CondFeatures.empty()) { diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp index 53e26a9f8e229..8295d001ec6f7 100644 --- a/clang/lib/Driver/ToolChains/Arch/X86.cpp +++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp @@ -273,7 +273,8 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, for (StringRef Value : A->getValues()) { if (Value == "egpr" || Value == "push2pop2" || Value == "ppx" || - Value == "ndd" || Value == "ccmp" || Value == "cf") { + Value == "ndd" || Value == "ccmp" || Value == "nf" || + Value == "cf") { Features.push_back( Args.MakeArgString((IsNegative ? "-" : "+") + Value)); continue; diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index 9849c59685cca..e307620a4e0ab 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -672,12 +672,41 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA, } } + // Facebook T92898286 + if (Args.hasArg(options::OPT_post_link_optimize)) + CmdArgs.push_back("-q"); + // End Facebook T92898286 + Args.AddAllArgs(CmdArgs, options::OPT_T); const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath()); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, Inputs, Output)); + // Facebook T92898286 + if (!Args.hasArg(options::OPT_post_link_optimize) || !Output.isFilename()) + return; + + const char *MvExec = Args.MakeArgString(ToolChain.GetProgramPath("mv")); + ArgStringList MoveCmdArgs; + MoveCmdArgs.push_back(Output.getFilename()); + const char *PreBoltBin = + Args.MakeArgString(Twine(Output.getFilename()) + ".pre-bolt"); + MoveCmdArgs.push_back(PreBoltBin); + C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), + MvExec, MoveCmdArgs, std::nullopt)); + + ArgStringList BoltCmdArgs; + const char *BoltExec = + Args.MakeArgString(ToolChain.GetProgramPath("llvm-bolt")); + BoltCmdArgs.push_back(PreBoltBin); + BoltCmdArgs.push_back("-reorder-blocks=reverse"); + BoltCmdArgs.push_back("-update-debug-sections"); + BoltCmdArgs.push_back("-o"); + BoltCmdArgs.push_back(Output.getFilename()); + C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), + BoltExec, BoltCmdArgs, std::nullopt)); + // End Facebook T92898286 } void tools::gnutools::Assembler::ConstructJob(Compilation &C, diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index ad0e2c3c620c3..6b9fbfe0ebf53 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -1422,7 +1422,7 @@ unsigned ContinuationIndenter::getNewLineColumn(const LineState &State) { // the next line. if (State.Line->InPragmaDirective) { FormatToken *PragmaType = State.Line->First->Next->Next; - if (PragmaType && PragmaType->TokenText.equals("omp")) + if (PragmaType && PragmaType->TokenText == "omp") return CurrentState.Indent + Style.ContinuationIndentWidth; } diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index c5c79dd0f883e..52005a6c881f3 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -3770,7 +3770,7 @@ reformat(const FormatStyle &Style, StringRef Code, tooling::Replacements NonNoOpFixes; for (const tooling::Replacement &Fix : Fixes) { StringRef OriginalCode = Code.substr(Fix.getOffset(), Fix.getLength()); - if (!OriginalCode.equals(Fix.getReplacementText())) { + if (OriginalCode != Fix.getReplacementText()) { auto Err = NonNoOpFixes.add(Fix); if (Err) { llvm::errs() << "Error adding replacements : " diff --git a/clang/lib/Format/QualifierAlignmentFixer.cpp b/clang/lib/Format/QualifierAlignmentFixer.cpp index a904f0b773c6d..077ce5e597a22 100644 --- a/clang/lib/Format/QualifierAlignmentFixer.cpp +++ b/clang/lib/Format/QualifierAlignmentFixer.cpp @@ -281,7 +281,7 @@ const FormatToken *LeftRightQualifierAlignmentFixer::analyzeRight( const FormatToken *LastSimpleTypeSpecifier = TypeToken; while (isQualifierOrType(LastSimpleTypeSpecifier->getNextNonComment(), - LangOpts)) { + &LangOpts)) { LastSimpleTypeSpecifier = LastSimpleTypeSpecifier->getNextNonComment(); } @@ -414,7 +414,7 @@ const FormatToken *LeftRightQualifierAlignmentFixer::analyzeLeft( const FormatToken *LastSimpleTypeSpecifier = TypeToken; while (isConfiguredQualifierOrType( LastSimpleTypeSpecifier->getPreviousNonComment(), - ConfiguredQualifierTokens, LangOpts)) { + ConfiguredQualifierTokens, &LangOpts)) { LastSimpleTypeSpecifier = LastSimpleTypeSpecifier->getPreviousNonComment(); } @@ -613,16 +613,18 @@ void prepareLeftRightOrderingForQualifierAlignmentFixer( } bool LeftRightQualifierAlignmentFixer::isQualifierOrType( - const FormatToken *Tok, const LangOptions &LangOpts) { - return Tok && (Tok->isTypeName(LangOpts) || Tok->is(tok::kw_auto) || - isQualifier(Tok)); + const FormatToken *Tok, const LangOptions *LangOpts) { + return Tok && + (Tok->isTypeName(LangOpts ? *LangOpts : getFormattingLangOpts()) || + Tok->is(tok::kw_auto) || isQualifier(Tok)); } bool LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( const FormatToken *Tok, const std::vector &Qualifiers, - const LangOptions &LangOpts) { - return Tok && (Tok->isTypeName(LangOpts) || Tok->is(tok::kw_auto) || - isConfiguredQualifier(Tok, Qualifiers)); + const LangOptions *LangOpts) { + return Tok && + (Tok->isTypeName(LangOpts ? *LangOpts : getFormattingLangOpts()) || + Tok->is(tok::kw_auto) || isConfiguredQualifier(Tok, Qualifiers)); } // If a token is an identifier and it's upper case, it could diff --git a/clang/lib/Format/QualifierAlignmentFixer.h b/clang/lib/Format/QualifierAlignmentFixer.h index 710fa2dc00309..97fcb42e4b2e5 100644 --- a/clang/lib/Format/QualifierAlignmentFixer.h +++ b/clang/lib/Format/QualifierAlignmentFixer.h @@ -72,11 +72,11 @@ class LeftRightQualifierAlignmentFixer : public TokenAnalyzer { // Is the Token a simple or qualifier type static bool isQualifierOrType(const FormatToken *Tok, - const LangOptions &LangOpts); + const LangOptions *LangOpts = nullptr); static bool isConfiguredQualifierOrType(const FormatToken *Tok, const std::vector &Qualifiers, - const LangOptions &LangOpts); + const LangOptions *LangOpts = nullptr); // Is the Token likely a Macro static bool isPossibleMacro(const FormatToken *Tok); diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index dbb5f5662ebf1..14ee02c4cd582 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -1969,7 +1969,7 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args) << A->getValue(); else if (Val == llvm::FunctionReturnThunksKind::Extern && - Args.getLastArgValue(OPT_mcmodel_EQ).equals("large")) + Args.getLastArgValue(OPT_mcmodel_EQ) == "large") Diags.Report(diag::err_drv_argument_not_allowed_with) << A->getAsString(Args) << Args.getLastArg(OPT_mcmodel_EQ)->getAsString(Args); diff --git a/clang/lib/Frontend/ModuleDependencyCollector.cpp b/clang/lib/Frontend/ModuleDependencyCollector.cpp index b88cb60ebdd2a..e2883f1e027e4 100644 --- a/clang/lib/Frontend/ModuleDependencyCollector.cpp +++ b/clang/lib/Frontend/ModuleDependencyCollector.cpp @@ -105,7 +105,7 @@ static bool isCaseSensitivePath(StringRef Path) { // already expects when sensitivity isn't setup. for (auto &C : Path) UpperDest.push_back(toUppercase(C)); - if (!llvm::sys::fs::real_path(UpperDest, RealDest) && Path.equals(RealDest)) + if (!llvm::sys::fs::real_path(UpperDest, RealDest) && Path == RealDest) return false; return true; } diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index 82eb47bcd5bf9..8e7386449dced 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -183,7 +183,7 @@ static MacroDiag shouldWarnOnMacroDef(Preprocessor &PP, IdentifierInfo *II) { return isFeatureTestMacro(Text) ? MD_NoWarn : MD_ReservedMacro; if (II->isKeyword(Lang)) return MD_KeywordDef; - if (Lang.CPlusPlus11 && (Text.equals("override") || Text.equals("final"))) + if (Lang.CPlusPlus11 && (Text == "override" || Text == "final")) return MD_KeywordDef; return MD_NoWarn; } @@ -2807,7 +2807,7 @@ static bool isConfigurationPattern(Token &MacroName, MacroInfo *MI, if (TrimmedValue.ends_with("__")) TrimmedValue = TrimmedValue.drop_back(2); } - return TrimmedValue.equals(MacroText); + return TrimmedValue == MacroText; } else { return false; } diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 5b5fc02ad4023..7fbaee5690bdf 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -3889,7 +3889,7 @@ void Parser::ParseDeclarationSpecifiers( // parse errors if this really is a __declspec attribute. Attempt to // recognize that scenario and recover gracefully. if (!getLangOpts().DeclSpecKeyword && Tok.is(tok::identifier) && - Tok.getIdentifierInfo()->getName().equals("__declspec")) { + Tok.getIdentifierInfo()->getName() == "__declspec") { Diag(Loc, diag::err_ms_attributes_not_enabled); // The next token should be an open paren. If it is, eat the entire diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp index 96c9708c3711b..65ddebca49bc6 100644 --- a/clang/lib/Parse/ParseDeclCXX.cpp +++ b/clang/lib/Parse/ParseDeclCXX.cpp @@ -4548,9 +4548,9 @@ static bool IsBuiltInOrStandardCXX11Attribute(IdentifierInfo *AttrName, case ParsedAttr::AT_Unlikely: return true; case ParsedAttr::AT_WarnUnusedResult: - return !ScopeName && AttrName->getName().equals("nodiscard"); + return !ScopeName && AttrName->getName() == "nodiscard"; case ParsedAttr::AT_Unused: - return !ScopeName && AttrName->getName().equals("maybe_unused"); + return !ScopeName && AttrName->getName() == "maybe_unused"; default: return false; } diff --git a/clang/lib/Parse/ParseHLSL.cpp b/clang/lib/Parse/ParseHLSL.cpp index e9c8d6dca7bf5..4b72afe9986e5 100644 --- a/clang/lib/Parse/ParseHLSL.cpp +++ b/clang/lib/Parse/ParseHLSL.cpp @@ -174,7 +174,7 @@ void Parser::ParseHLSLAnnotations(ParsedAttributes &Attrs, ArgExprs.push_back(ParseIdentifierLoc()); // Add numeric_constant for fix-it. - if (SpaceStr.equals("space") && Tok.is(tok::numeric_constant)) + if (SpaceStr == "space" && Tok.is(tok::numeric_constant)) fixSeparateAttrArgAndNumber(SpaceStr, SpaceLoc, Tok, ArgExprs, *this, Actions.Context, PP); } diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index 18ba1185ee8de..53eabe0c662e5 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -740,7 +740,7 @@ static bool parseDeclareSimdClauses( BS = Out; BSRange = SourceRange(Tok.getLocation(), Tok.getEndLoc()); P.ConsumeToken(); - } else if (ClauseName.equals("simdlen")) { + } else if (ClauseName == "simdlen") { if (SimdLen.isUsable()) { P.Diag(Tok, diag::err_omp_more_one_clause) << getOpenMPDirectiveName(OMPD_declare_simd) << ClauseName << 0; @@ -1106,7 +1106,7 @@ static ExprResult parseContextScore(Parser &P) { llvm::SmallString<16> Buffer; StringRef SelectorName = P.getPreprocessor().getSpelling(P.getCurToken(), Buffer); - if (!SelectorName.equals("score")) + if (SelectorName != "score") return ScoreExpr; (void)P.ConsumeToken(); SourceLocation RLoc; diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp index 3f0ab10646fe5..87aa0cacc2496 100644 --- a/clang/lib/Sema/SemaCodeComplete.cpp +++ b/clang/lib/Sema/SemaCodeComplete.cpp @@ -4049,18 +4049,17 @@ unsigned clang::getMacroUsagePriority(StringRef MacroName, unsigned Priority = CCP_Macro; // Treat the "nil", "Nil" and "NULL" macros as null pointer constants. - if (MacroName.equals("nil") || MacroName.equals("NULL") || - MacroName.equals("Nil")) { + if (MacroName == "nil" || MacroName == "NULL" || MacroName == "Nil") { Priority = CCP_Constant; if (PreferredTypeIsPointer) Priority = Priority / CCF_SimilarTypeMatch; } // Treat "YES", "NO", "true", and "false" as constants. - else if (MacroName.equals("YES") || MacroName.equals("NO") || - MacroName.equals("true") || MacroName.equals("false")) + else if (MacroName == "YES" || MacroName == "NO" || MacroName == "true" || + MacroName == "false") Priority = CCP_Constant; // Treat "bool" as a type. - else if (MacroName.equals("bool")) + else if (MacroName == "bool") Priority = CCP_Type + (LangOpts.ObjC ? CCD_bool_in_ObjC : 0); return Priority; diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 590f37837eb2d..fb913034bd836 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -735,8 +735,8 @@ void Sema::DiagnoseUnknownTypeName(IdentifierInfo *&II, << II, CanRecover); } else if (DeclContext *DC = computeDeclContext(*SS, false)) { std::string CorrectedStr(Corrected.getAsString(getLangOpts())); - bool DroppedSpecifier = Corrected.WillReplaceSpecifier() && - II->getName().equals(CorrectedStr); + bool DroppedSpecifier = + Corrected.WillReplaceSpecifier() && II->getName() == CorrectedStr; diagnoseTypo(Corrected, PDiag(IsTemplateName ? diag::err_no_member_template_suggest @@ -1007,7 +1007,7 @@ Sema::NameClassification Sema::ClassifyName(Scope *S, CXXScopeSpec &SS, } else {// FIXME: is this even reachable? Test it. std::string CorrectedStr(Corrected.getAsString(getLangOpts())); bool DroppedSpecifier = Corrected.WillReplaceSpecifier() && - Name->getName().equals(CorrectedStr); + Name->getName() == CorrectedStr; diagnoseTypo(Corrected, PDiag(QualifiedDiag) << Name << computeDeclContext(SS, false) << DroppedSpecifier << SS.getRange()); @@ -16076,7 +16076,7 @@ static void diagnoseImplicitlyRetainedSelf(Sema &S) { static bool methodHasName(const FunctionDecl *FD, StringRef Name) { return isa(FD) && FD->param_empty() && - FD->getDeclName().isIdentifier() && FD->getName().equals(Name); + FD->getDeclName().isIdentifier() && FD->getName() == Name; } bool Sema::CanBeGetReturnObject(const FunctionDecl *FD) { diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index d77b9507066b0..53238d355ea09 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -12234,8 +12234,8 @@ static bool TryNamespaceTypoCorrection(Sema &S, LookupResult &R, Scope *Sc, DiagnoseInvisibleNamespace(Corrected, S); } else if (DeclContext *DC = S.computeDeclContext(SS, false)) { std::string CorrectedStr(Corrected.getAsString(S.getLangOpts())); - bool DroppedSpecifier = Corrected.WillReplaceSpecifier() && - Ident->getName().equals(CorrectedStr); + bool DroppedSpecifier = + Corrected.WillReplaceSpecifier() && Ident->getName() == CorrectedStr; S.diagnoseTypo(Corrected, S.PDiag(diag::err_using_directive_member_suggest) << Ident << DC << DroppedSpecifier << SS.getRange(), diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index c688cb21f2364..bb4b116fd73ca 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -5777,10 +5777,9 @@ ExprResult Sema::BuildCXXDefaultArgExpr(SourceLocation CallLoc, Res = Immediate.TransformInitializer(Param->getInit(), /*NotCopy=*/false); }); - if (Res.isInvalid()) - return ExprError(); - Res = ConvertParamDefaultArgument(Param, Res.get(), - Res.get()->getBeginLoc()); + if (Res.isUsable()) + Res = ConvertParamDefaultArgument(Param, Res.get(), + Res.get()->getBeginLoc()); if (Res.isInvalid()) return ExprError(); Init = Res.get(); @@ -5816,7 +5815,7 @@ ExprResult Sema::BuildCXXDefaultInitExpr(SourceLocation Loc, FieldDecl *Field) { Expr *Init = nullptr; bool NestedDefaultChecking = isCheckingDefaultArgumentOrInitializer(); - + bool InLifetimeExtendingContext = isInLifetimeExtendingContext(); EnterExpressionEvaluationContext EvalContext( *this, ExpressionEvaluationContext::PotentiallyEvaluated, Field); @@ -5851,19 +5850,35 @@ ExprResult Sema::BuildCXXDefaultInitExpr(SourceLocation Loc, FieldDecl *Field) { ImmediateCallVisitor V(getASTContext()); if (!NestedDefaultChecking) V.TraverseDecl(Field); - if (V.HasImmediateCalls) { + + // CWG1815 + // Support lifetime extension of temporary created by aggregate + // initialization using a default member initializer. We should always rebuild + // the initializer if it contains any temporaries (if the initializer + // expression is an ExprWithCleanups). Then make sure the normal lifetime + // extension code recurses into the default initializer and does lifetime + // extension when warranted. + bool ContainsAnyTemporaries = + isa_and_present(Field->getInClassInitializer()); + if (V.HasImmediateCalls || InLifetimeExtendingContext || + ContainsAnyTemporaries) { ExprEvalContexts.back().DelayedDefaultInitializationContext = {Loc, Field, CurContext}; ExprEvalContexts.back().IsCurrentlyCheckingDefaultArgumentOrInitializer = NestedDefaultChecking; - + // Pass down lifetime extending flag, and collect temporaries in + // CreateMaterializeTemporaryExpr when we rewrite the call argument. + keepInLifetimeExtendingContext(); EnsureImmediateInvocationInDefaultArgs Immediate(*this); ExprResult Res; + + // Rebuild CXXDefaultInitExpr might cause diagnostics. + SFINAETrap Trap(*this); runWithSufficientStackSpace(Loc, [&] { Res = Immediate.TransformInitializer(Field->getInClassInitializer(), /*CXXDirectInit=*/false); }); - if (!Res.isInvalid()) + if (Res.isUsable()) Res = ConvertMemberDefaultInitExpression(Field, Res.get(), Loc); if (Res.isInvalid()) { Field->setInvalidDecl(); diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index c8049ae581f84..fe4a698a612ea 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -8065,11 +8065,6 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, enum PathLifetimeKind { /// Lifetime-extend along this path. Extend, - /// We should lifetime-extend, but we don't because (due to technical - /// limitations) we can't. This happens for default member initializers, - /// which we don't clone for every use, so we don't have a unique - /// MaterializeTemporaryExpr to update. - ShouldExtend, /// Do not lifetime extend along this path. NoExtend }; @@ -8081,7 +8076,7 @@ shouldLifetimeExtendThroughPath(const IndirectLocalPath &Path) { PathLifetimeKind Kind = PathLifetimeKind::Extend; for (auto Elem : Path) { if (Elem.Kind == IndirectLocalPathEntry::DefaultInit) - Kind = PathLifetimeKind::ShouldExtend; + Kind = PathLifetimeKind::Extend; else if (Elem.Kind != IndirectLocalPathEntry::LambdaCaptureInit) return PathLifetimeKind::NoExtend; } @@ -8201,18 +8196,6 @@ void Sema::checkInitializerLifetime(const InitializedEntity &Entity, ExtendingEntity->allocateManglingNumber()); // Also visit the temporaries lifetime-extended by this initializer. return true; - - case PathLifetimeKind::ShouldExtend: - // We're supposed to lifetime-extend the temporary along this path (per - // the resolution of DR1815), but we don't support that yet. - // - // FIXME: Properly handle this situation. Perhaps the easiest approach - // would be to clone the initializer expression on each use that would - // lifetime extend its temporaries. - Diag(DiagLoc, diag::warn_unsupported_lifetime_extension) - << RK << DiagRange; - break; - case PathLifetimeKind::NoExtend: // If the path goes through the initialization of a variable or field, // it can't possibly reach a temporary created in this full-expression. diff --git a/clang/lib/Sema/SemaStmtAsm.cpp b/clang/lib/Sema/SemaStmtAsm.cpp index 83351b703c153..32d42f3c3f3bb 100644 --- a/clang/lib/Sema/SemaStmtAsm.cpp +++ b/clang/lib/Sema/SemaStmtAsm.cpp @@ -829,7 +829,7 @@ bool Sema::LookupInlineAsmField(StringRef Base, StringRef Member, NamedDecl *FoundDecl = nullptr; // MS InlineAsm uses 'this' as a base - if (getLangOpts().CPlusPlus && Base.equals("this")) { + if (getLangOpts().CPlusPlus && Base == "this") { if (const Type *PT = getCurrentThisType().getTypePtrOrNull()) FoundDecl = PT->getPointeeType()->getAsTagDecl(); } else { diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 480c0103ae335..0e7bd8dd89573 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -2679,7 +2679,7 @@ struct ConvertConstructorToDeductionGuideTransform { // placeholder to indicate there is a default argument. QualType ParamTy = NewDI->getType(); NewDefArg = new (SemaRef.Context) - OpaqueValueExpr(OldParam->getDefaultArg()->getBeginLoc(), + OpaqueValueExpr(OldParam->getDefaultArgRange().getBegin(), ParamTy.getNonLValueExprType(SemaRef.Context), ParamTy->isLValueReferenceType() ? VK_LValue : ParamTy->isRValueReferenceType() ? VK_XValue @@ -4268,8 +4268,8 @@ checkBuiltinTemplateIdType(Sema &SemaRef, BuiltinTemplateDecl *BTD, /// Determine whether this alias template is "enable_if_t". /// libc++ >=14 uses "__enable_if_t" in C++11 mode. static bool isEnableIfAliasTemplate(TypeAliasTemplateDecl *AliasTemplate) { - return AliasTemplate->getName().equals("enable_if_t") || - AliasTemplate->getName().equals("__enable_if_t"); + return AliasTemplate->getName() == "enable_if_t" || + AliasTemplate->getName() == "__enable_if_t"; } /// Collect all of the separable terms in the given condition, which diff --git a/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp index d17f5ddf07055..a0190c30bfd28 100644 --- a/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp @@ -1065,15 +1065,14 @@ void GenericTaintChecker::taintUnsafeSocketProtocol(const CallEvent &Call, const IdentifierInfo *ID = Call.getCalleeIdentifier(); if (!ID) return; - if (!ID->getName().equals("socket")) + if (ID->getName() != "socket") return; SourceLocation DomLoc = Call.getArgExpr(0)->getExprLoc(); StringRef DomName = C.getMacroNameOrSpelling(DomLoc); // Allow internal communication protocols. - bool SafeProtocol = DomName.equals("AF_SYSTEM") || - DomName.equals("AF_LOCAL") || DomName.equals("AF_UNIX") || - DomName.equals("AF_RESERVED_36"); + bool SafeProtocol = DomName == "AF_SYSTEM" || DomName == "AF_LOCAL" || + DomName == "AF_UNIX" || DomName == "AF_RESERVED_36"; if (SafeProtocol) return; diff --git a/clang/lib/StaticAnalyzer/Checkers/LLVMConventionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/LLVMConventionsChecker.cpp index fa51aa80216b4..1cb3848cfed2a 100644 --- a/clang/lib/StaticAnalyzer/Checkers/LLVMConventionsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/LLVMConventionsChecker.cpp @@ -41,7 +41,7 @@ static bool InNamespace(const Decl *D, StringRef NS) { if (!ND) return false; const IdentifierInfo *II = ND->getIdentifier(); - if (!II || !II->getName().equals(NS)) + if (!II || II->getName() != NS) return false; return isa(ND->getDeclContext()); } diff --git a/clang/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp index 882eb0236a189..f524c4c067c8c 100644 --- a/clang/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp @@ -1159,7 +1159,7 @@ void EmptyLocalizationContextChecker::MethodCrawler::VisitObjCMessageExpr( } if (isAnyIdentifier(Result.getKind())) { - if (Result.getRawIdentifier().equals("nil")) { + if (Result.getRawIdentifier() == "nil") { reportEmptyContextError(ME); return; } diff --git a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp index ab89fb14046be..34af7fb131f5a 100644 --- a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp @@ -1727,7 +1727,7 @@ static std::optional getFreeWhenDoneArg(const ObjCMethodCall &Call) { // FIXME: We should not rely on fully-constrained symbols being folded. for (unsigned i = 1; i < S.getNumArgs(); ++i) - if (S.getNameForSlot(i).equals("freeWhenDone")) + if (S.getNameForSlot(i) == "freeWhenDone") return !Call.getArgSVal(i).isZeroConstant(); return std::nullopt; @@ -3255,7 +3255,7 @@ bool MallocChecker::mayFreeAnyEscapedMemoryOrIsModeledExplicitly( if (FirstSlot.starts_with("addPointer") || FirstSlot.starts_with("insertPointer") || FirstSlot.starts_with("replacePointer") || - FirstSlot.equals("valueWithPointer")) { + FirstSlot == "valueWithPointer") { return true; } diff --git a/clang/lib/StaticAnalyzer/Checkers/ObjCContainersASTChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ObjCContainersASTChecker.cpp index 2b008d1c775a2..6978d81faf1c7 100644 --- a/clang/lib/StaticAnalyzer/Checkers/ObjCContainersASTChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/ObjCContainersASTChecker.cpp @@ -101,14 +101,14 @@ void WalkAST::VisitCallExpr(CallExpr *CE) { const Expr *Arg = nullptr; unsigned ArgNum; - if (Name.equals("CFArrayCreate") || Name.equals("CFSetCreate")) { + if (Name == "CFArrayCreate" || Name == "CFSetCreate") { if (CE->getNumArgs() != 4) return; ArgNum = 1; Arg = CE->getArg(ArgNum)->IgnoreParenCasts(); if (hasPointerToPointerSizedType(Arg)) return; - } else if (Name.equals("CFDictionaryCreate")) { + } else if (Name == "CFDictionaryCreate") { if (CE->getNumArgs() != 6) return; // Check first argument. diff --git a/clang/lib/StaticAnalyzer/Checkers/ObjCContainersChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ObjCContainersChecker.cpp index 28e88245ca95a..4937af3b91c29 100644 --- a/clang/lib/StaticAnalyzer/Checkers/ObjCContainersChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/ObjCContainersChecker.cpp @@ -82,7 +82,7 @@ void ObjCContainersChecker::checkPostStmt(const CallExpr *CE, return; // Add array size information to the state. - if (Name.equals("CFArrayCreate")) { + if (Name == "CFArrayCreate") { if (CE->getNumArgs() < 3) return; // Note, we can visit the Create method in the post-visit because @@ -92,7 +92,7 @@ void ObjCContainersChecker::checkPostStmt(const CallExpr *CE, return; } - if (Name.equals("CFArrayGetCount")) { + if (Name == "CFArrayGetCount") { addSizeInfo(CE->getArg(0), CE, C); return; } @@ -105,7 +105,7 @@ void ObjCContainersChecker::checkPreStmt(const CallExpr *CE, return; // Check the array access. - if (Name.equals("CFArrayGetValueAtIndex")) { + if (Name == "CFArrayGetValueAtIndex") { ProgramStateRef State = C.getState(); // Retrieve the size. // Find out if we saw this array symbol before and have information about diff --git a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp index a7b6f6c1fb55c..d4e020f7a72a0 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp @@ -1143,7 +1143,7 @@ void StreamChecker::evalFscanf(const FnDescription *Desc, const CallEvent &Call, return; if (auto const *Callee = Call.getCalleeIdentifier(); - !Callee || !Callee->getName().equals("vfscanf")) { + !Callee || Callee->getName() != "vfscanf") { SmallVector EscArgs; for (auto EscArg : llvm::seq(2u, Call.getNumArgs())) EscArgs.push_back(EscArg); diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp index 3abfa4cbb295d..ad493587affa0 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp @@ -350,8 +350,11 @@ class TrivialFunctionAnalysisVisitor const auto &Name = safeGetName(Callee); if (Name == "WTFCrashWithInfo" || Name == "WTFBreakpointTrap" || - Name == "WTFReportAssertionFailure" || - Name == "compilerFenceForCrash" || Name.find("__builtin") == 0) + Name == "WTFReportAssertionFailure" || Name == "isMainThread" || + Name == "isMainThreadOrGCThread" || Name == "isMainRunLoop" || + Name == "isWebThread" || Name == "isUIThread" || + Name == "compilerFenceForCrash" || Name == "bitwise_cast" || + Name == "addressof" || Name.find("__builtin") == 0) return true; return TrivialFunctionAnalysis::isTrivialImpl(Callee, Cache); @@ -428,6 +431,8 @@ class TrivialFunctionAnalysisVisitor return TrivialFunctionAnalysis::isTrivialImpl(CE->getConstructor(), Cache); } + bool VisitCXXNewExpr(const CXXNewExpr *NE) { return VisitChildren(NE); } + bool VisitImplicitCastExpr(const ImplicitCastExpr *ICE) { return Visit(ICE->getSubExpr()); } diff --git a/clang/lib/StaticAnalyzer/Core/CheckerContext.cpp b/clang/lib/StaticAnalyzer/Core/CheckerContext.cpp index 113abcd4c2ab0..96464b30c078f 100644 --- a/clang/lib/StaticAnalyzer/Core/CheckerContext.cpp +++ b/clang/lib/StaticAnalyzer/Core/CheckerContext.cpp @@ -104,7 +104,7 @@ bool CheckerContext::isCLibraryFunction(const FunctionDecl *FD, return true; StringRef FName = II->getName(); - if (FName.equals(Name)) + if (FName == Name) return true; if (FName.starts_with("__inline") && FName.contains(Name)) diff --git a/clang/lib/Tooling/Tooling.cpp b/clang/lib/Tooling/Tooling.cpp index c5c3cdb47e92e..ffacf9cf1f782 100644 --- a/clang/lib/Tooling/Tooling.cpp +++ b/clang/lib/Tooling/Tooling.cpp @@ -293,7 +293,7 @@ void addTargetAndModeForProgramName(std::vector &CommandLine, ++Token) { StringRef TokenRef(*Token); ShouldAddTarget = ShouldAddTarget && !TokenRef.starts_with(TargetOPT) && - !TokenRef.equals(TargetOPTLegacy); + TokenRef != TargetOPTLegacy; ShouldAddMode = ShouldAddMode && !TokenRef.starts_with(DriverModeOPT); } if (ShouldAddMode) { diff --git a/clang/test/AST/Interp/builtin-align-cxx.cpp b/clang/test/AST/Interp/builtin-align-cxx.cpp index 62d73dba929b2..c4103953df026 100644 --- a/clang/test/AST/Interp/builtin-align-cxx.cpp +++ b/clang/test/AST/Interp/builtin-align-cxx.cpp @@ -2,19 +2,6 @@ // RUN: %clang_cc1 -triple=x86_64-unknown-unknown -std=c++11 %s -fsyntax-only -verify=expected,both -fexperimental-new-constant-interpreter // RUN: %clang_cc1 -triple=x86_64-unknown-unknown -std=c++11 %s -fsyntax-only -verify=ref,both - -/// This is just a copy of the one from test/SemaCXX/ with some of the -/// diagnostic output adapted. -/// Also, align32array has an initializer now, which means it's not just -/// a dummy pointer for us and we do actually have type information for it. -/// In the future, we need to retain type information for dummy pointers as -/// well, so here is a test that will break once we do that: -namespace { - _Alignas(32) char heh[4]; - static_assert(!__builtin_is_aligned(&heh[1], 4), ""); // expected-error {{failed}} -} - - // Check that we don't crash when using dependent types in __builtin_align: template void *c(void *d) { // both-note{{candidate template ignored}} @@ -177,7 +164,7 @@ static_assert(wrap_align_up(static_cast(1), const_value(1 << 21)), ""); // // both-note@-1{{in instantiation of function template specialization 'wrap_align_up' requested here}} // Check constant evaluation for pointers: -_Alignas(32) char align32array[128] = {}; +_Alignas(32) char align32array[128]; static_assert(&align32array[0] == &align32array[0], ""); // __builtin_align_up/down can be constant evaluated as a no-op for values // that are known to have greater alignment: diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c index 207da5fe81260..2c675f4418ef3 100644 --- a/clang/test/AST/Interp/c.c +++ b/clang/test/AST/Interp/c.c @@ -270,3 +270,6 @@ int test3(void) { a[0] = test3; // all-error {{incompatible pointer to integer conversion assigning to 'int' from 'int (void)'}} return 0; } +/// This tests that we have full type info, even for values we cannot read. +int dummyarray[5]; +_Static_assert(&dummyarray[0] < &dummyarray[1], ""); // pedantic-warning {{GNU extension}} diff --git a/clang/test/AST/ast-dump-default-init-json.cpp b/clang/test/AST/ast-dump-default-init-json.cpp index 1058b4e3ea4d9..f4949a9c9eedf 100644 --- a/clang/test/AST/ast-dump-default-init-json.cpp +++ b/clang/test/AST/ast-dump-default-init-json.cpp @@ -789,10 +789,10 @@ void test() { // CHECK-NEXT: "valueCategory": "lvalue", // CHECK-NEXT: "extendingDecl": { // CHECK-NEXT: "id": "0x{{.*}}", -// CHECK-NEXT: "kind": "FieldDecl", -// CHECK-NEXT: "name": "a", +// CHECK-NEXT: "kind": "VarDecl", +// CHECK-NEXT: "name": "b", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "const A &" +// CHECK-NEXT: "qualType": "B" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "storageDuration": "automatic", diff --git a/clang/test/AST/ast-dump-default-init.cpp b/clang/test/AST/ast-dump-default-init.cpp index 15b29f04bf21b..26864fbf15424 100644 --- a/clang/test/AST/ast-dump-default-init.cpp +++ b/clang/test/AST/ast-dump-default-init.cpp @@ -13,7 +13,7 @@ void test() { } // CHECK: -CXXDefaultInitExpr 0x{{[^ ]*}} <{{.*}}> 'const A' lvalue has rewritten init // CHECK-NEXT: `-ExprWithCleanups 0x{{[^ ]*}} <{{.*}}> 'const A' lvalue -// CHECK-NEXT: `-MaterializeTemporaryExpr 0x{{[^ ]*}} <{{.*}}> 'const A' lvalue extended by Field 0x{{[^ ]*}} 'a' 'const A &' +// CHECK-NEXT: `-MaterializeTemporaryExpr 0x{{[^ ]*}} <{{.*}}> 'const A' lvalue extended by Var 0x{{[^ ]*}} 'b' 'B' // CHECK-NEXT: `-ImplicitCastExpr 0x{{[^ ]*}} <{{.*}}> 'const A' // CHECK-NEXT: `-CXXFunctionalCastExpr 0x{{[^ ]*}} <{{.*}}> 'A' functional cast to A // CHECK-NEXT: `-InitListExpr 0x{{[^ ]*}} <{{.*}}> 'A' diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp index 6ca7677511d73..073f3252160ee 100644 --- a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp +++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp @@ -59,6 +59,18 @@ void WTFCrashWithInfo(int line, const char* file, const char* function, int coun WTFCrashWithInfoImpl(line, file, function, counter, wtfCrashArg(reason)); } +template +ToType bitwise_cast(FromType from); + +template +T* addressof(T& arg); + +bool isMainThread(); +bool isMainThreadOrGCThread(); +bool isMainRunLoop(); +bool isWebThread(); +bool isUIThread(); + enum class Flags : unsigned short { Flag1 = 1 << 0, Flag2 = 1 << 1, @@ -234,6 +246,16 @@ class RefCounted { void trivial38() { v++; if (__builtin_expect(!!(number), 1)) (*number)++; } int trivial39() { return -v; } int trivial40() { return v << 2; } + unsigned trivial41() { v = ++s_v; return v; } + unsigned trivial42() { return bitwise_cast(nullptr); } + Number* trivial43() { return addressof(*number); } + Number* trivial44() { return new Number(1); } + ComplexNumber* trivial45() { return new ComplexNumber(); } + void trivial46() { ASSERT(isMainThread()); } + void trivial47() { ASSERT(isMainThreadOrGCThread()); } + void trivial48() { ASSERT(isMainRunLoop()); } + void trivial49() { ASSERT(isWebThread()); } + void trivial50() { ASSERT(isUIThread()); } static RefCounted& singleton() { static RefCounted s_RefCounted; @@ -312,13 +334,17 @@ class RefCounted { void nonTrivial16() { complex++; } ComplexNumber nonTrivial17() { return complex << 2; } ComplexNumber nonTrivial18() { return +complex; } + ComplexNumber* nonTrivial19() { return new ComplexNumber(complex); } + static unsigned s_v; unsigned v { 0 }; Number* number { nullptr }; ComplexNumber complex; Enum enumValue { Enum::Value1 }; }; +unsigned RefCounted::s_v = 0; + RefCounted* refCountedObj(); void test() @@ -377,6 +403,16 @@ class UnrelatedClass { getFieldTrivial().trivial38(); // no-warning getFieldTrivial().trivial39(); // no-warning getFieldTrivial().trivial40(); // no-warning + getFieldTrivial().trivial41(); // no-warning + getFieldTrivial().trivial42(); // no-warning + getFieldTrivial().trivial43(); // no-warning + getFieldTrivial().trivial44(); // no-warning + getFieldTrivial().trivial45(); // no-warning + getFieldTrivial().trivial46(); // no-warning + getFieldTrivial().trivial47(); // no-warning + getFieldTrivial().trivial48(); // no-warning + getFieldTrivial().trivial49(); // no-warning + getFieldTrivial().trivial50(); // no-warning RefCounted::singleton().trivial18(); // no-warning RefCounted::singleton().someFunction(); // no-warning @@ -419,6 +455,8 @@ class UnrelatedClass { // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}} getFieldTrivial().nonTrivial18(); // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}} + getFieldTrivial().nonTrivial19(); + // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}} } }; diff --git a/clang/test/Analysis/lifetime-extended-regions.cpp b/clang/test/Analysis/lifetime-extended-regions.cpp index 4e98bd4b0403e..4458ad294af7c 100644 --- a/clang/test/Analysis/lifetime-extended-regions.cpp +++ b/clang/test/Analysis/lifetime-extended-regions.cpp @@ -120,10 +120,11 @@ void aggregateWithReferences() { clang_analyzer_dump(viaReference); // expected-warning-re {{&lifetime_extended_object{RefAggregate, viaReference, S{{[0-9]+}}} }} clang_analyzer_dump(viaReference.rx); // expected-warning-re {{&lifetime_extended_object{int, viaReference, S{{[0-9]+}}} }} clang_analyzer_dump(viaReference.ry); // expected-warning-re {{&lifetime_extended_object{Composite, viaReference, S{{[0-9]+}}} }} - - // clang does not currently implement extending lifetime of object bound to reference members of aggregates, - // that are created from default member initializer (see `warn_unsupported_lifetime_extension` from `-Wdangling`) - RefAggregate defaultInitExtended{i}; // clang-bug does not extend `Composite` + + // FIXME: clang currently support extending lifetime of object bound to reference members of aggregates, + // that are created from default member initializer. But CFG and ExprEngine need to be updated to address this change. + // The following expect warning: {{&lifetime_extended_object{Composite, defaultInitExtended, S{{[0-9]+}}} }} + RefAggregate defaultInitExtended{i}; clang_analyzer_dump(defaultInitExtended.ry); // expected-warning {{Unknown }} } diff --git a/clang/test/CXX/drs/cwg16xx.cpp b/clang/test/CXX/drs/cwg16xx.cpp index cf6b45ceabf2c..82ef871939d2c 100644 --- a/clang/test/CXX/drs/cwg16xx.cpp +++ b/clang/test/CXX/drs/cwg16xx.cpp @@ -483,8 +483,6 @@ namespace cwg1696 { // cwg1696: 7 const A &a = A(); // #cwg1696-D1-a }; D1 d1 = {}; // #cwg1696-d1 - // since-cxx14-warning@-1 {{lifetime extension of temporary created by aggregate initialization using a default member initializer is not yet supported; lifetime of temporary will end at the end of the full-expression}} - // since-cxx14-note@#cwg1696-D1-a {{initializing field 'a' with default member initializer}} struct D2 { const A &a = A(); // #cwg1696-D2-a diff --git a/clang/test/CXX/drs/cwg18xx.cpp b/clang/test/CXX/drs/cwg18xx.cpp index 3a2248a1af551..9eb749153e57a 100644 --- a/clang/test/CXX/drs/cwg18xx.cpp +++ b/clang/test/CXX/drs/cwg18xx.cpp @@ -206,22 +206,78 @@ namespace cwg1814 { // cwg1814: yes #endif } -namespace cwg1815 { // cwg1815: no +namespace cwg1815 { // cwg1815: yes #if __cplusplus >= 201402L - // FIXME: needs codegen test - struct A { int &&r = 0; }; // #cwg1815-A + struct A { int &&r = 0; }; A a = {}; - // since-cxx14-warning@-1 {{lifetime extension of temporary created by aggregate initialization using a default member initializer is not yet supported; lifetime of temporary will end at the end of the full-expression}} FIXME - // since-cxx14-note@#cwg1815-A {{initializing field 'r' with default member initializer}} struct B { int &&r = 0; }; // #cwg1815-B // since-cxx14-error@-1 {{reference member 'r' binds to a temporary object whose lifetime would be shorter than the lifetime of the constructed object}} // since-cxx14-note@#cwg1815-B {{initializing field 'r' with default member initializer}} // since-cxx14-note@#cwg1815-b {{in implicit default constructor for 'cwg1815::B' first required here}} B b; // #cwg1815-b + +#if __cplusplus >= 201703L + struct C { const int &r = 0; }; + constexpr C c = {}; // OK, since cwg1815 + static_assert(c.r == 0); + + constexpr int f() { + A a = {}; // OK, since cwg1815 + return a.r; + } + static_assert(f() == 0); #endif +#endif +} + +namespace cwg1820 { // cwg1820: 3.5 +typedef int A; +typedef int cwg1820::A; +// expected-warning@-1 {{extra qualification on member 'A'}} +// expected-error@-2 {{typedef declarator cannot be qualified}} + +namespace B { +typedef int cwg1820::A; +// expected-error@-1 {{cannot define or redeclare 'A' here because namespace 'B' does not enclose namespace 'cwg1820'}} +// expected-error@-2 {{typedef declarator cannot be qualified}} +} + +class C1 { + typedef int cwg1820::A; + // expected-error@-1 {{non-friend class member 'A' cannot have a qualified name}} + // expected-error@-2 {{typedef declarator cannot be qualified}} +}; + +template +class C2 { + typedef int cwg1820::A; + // expected-error@-1 {{non-friend class member 'A' cannot have a qualified name}} + // expected-error@-2 {{typedef declarator cannot be qualified}} +}; + +void d1() { + typedef int cwg1820::A; + // expected-error@-1 {{definition or redeclaration of 'A' not allowed inside a function}} + // expected-error@-2 {{typedef declarator cannot be qualified}} +} + +template +void d2() { + typedef int cwg1820::A; + // expected-error@-1 {{definition or redeclaration of 'A' not allowed inside a function}} + // expected-error@-2 {{typedef declarator cannot be qualified}} } +#if __cplusplus >= 201103L +auto e = [] { + typedef int cwg1820::A; + // expected-error@-1 {{definition or redeclaration of 'A' not allowed inside a function}} + // expected-error@-2 {{typedef declarator cannot be qualified}} +}; +#endif +} // namespace cwg1820 + namespace cwg1821 { // cwg1821: 2.9 struct A { template struct B { diff --git a/clang/test/CXX/special/class.temporary/p6.cpp b/clang/test/CXX/special/class.temporary/p6.cpp index 5554363cc69ab..a6d2adfd1fd2c 100644 --- a/clang/test/CXX/special/class.temporary/p6.cpp +++ b/clang/test/CXX/special/class.temporary/p6.cpp @@ -269,6 +269,40 @@ void init_capture_init_list() { // CHECK: } } +void check_dr1815() { // dr1815: yes +#if __cplusplus >= 201402L + + struct A { + int &&r = 0; + ~A() {} + }; + + struct B { + A &&a = A{}; + ~B() {} + }; + B a = {}; + + // CHECK: call {{.*}}block_scope_begin_function + extern void block_scope_begin_function(); + extern void block_scope_end_function(); + block_scope_begin_function(); + { + // CHECK: call void @_ZZ12check_dr1815vEN1BD1Ev + // CHECK: call void @_ZZ12check_dr1815vEN1AD1Ev + B b = {}; + } + // CHECK: call {{.*}}block_scope_end_function + block_scope_end_function(); + + // CHECK: call {{.*}}some_other_function + extern void some_other_function(); + some_other_function(); + // CHECK: call void @_ZZ12check_dr1815vEN1BD1Ev + // CHECK: call void @_ZZ12check_dr1815vEN1AD1Ev +#endif +} + namespace P2718R0 { namespace basic { template using T2 = std::list; diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c index 998d5f37da69b..25f8f66bc3213 100644 --- a/clang/test/Driver/x86-target-features.c +++ b/clang/test/Driver/x86-target-features.c @@ -436,12 +436,14 @@ // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=ppx %s -### -o %t.o 2>&1 | FileCheck -check-prefix=PPX %s // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=ndd %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NDD %s // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=ccmp %s -### -o %t.o 2>&1 | FileCheck -check-prefix=CCMP %s +// RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=nf %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NF %s // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=cf %s -### -o %t.o 2>&1 | FileCheck -check-prefix=CF %s // EGPR: "-target-feature" "+egpr" // PUSH2POP2: "-target-feature" "+push2pop2" // PPX: "-target-feature" "+ppx" // NDD: "-target-feature" "+ndd" // CCMP: "-target-feature" "+ccmp" +// NF: "-target-feature" "+nf" // CF: "-target-feature" "+cf" // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=egpr,ndd %s -### -o %t.o 2>&1 | FileCheck -check-prefix=EGPR-NDD %s diff --git a/clang/test/Modules/implicit-module-no-timestamp.cpp b/clang/test/Modules/implicit-module-no-timestamp.cpp index 1b681a610bab2..50767b4a11468 100644 --- a/clang/test/Modules/implicit-module-no-timestamp.cpp +++ b/clang/test/Modules/implicit-module-no-timestamp.cpp @@ -1,4 +1,3 @@ -// UNSUPPORTED: system-windows // RUN: rm -rf %t // RUN: split-file %s %t // RUN: cd %t diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c index 5602c59158fe5..57104c9e7a500 100644 --- a/clang/test/Preprocessor/x86_target_features.c +++ b/clang/test/Preprocessor/x86_target_features.c @@ -802,6 +802,7 @@ // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=ppx -x c -E -dM -o - %s | FileCheck --check-prefix=PPX %s // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=ndd -x c -E -dM -o - %s | FileCheck --check-prefix=NDD %s // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=ccmp -x c -E -dM -o - %s | FileCheck --check-prefix=CCMP %s +// RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=nf -x c -E -dM -o - %s | FileCheck --check-prefix=NF %s // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=cf -x c -E -dM -o - %s | FileCheck --check-prefix=CF %s // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapxf -x c -E -dM -o - %s | FileCheck --check-prefixes=EGPR,PUSH2POP2,PPX,NDD,APXF %s // APXF: #define __APX_F__ 1 @@ -809,5 +810,6 @@ // CF: #define __CF__ 1 // EGPR: #define __EGPR__ 1 // NDD: #define __NDD__ 1 +// NF: #define __NF__ 1 // PPX: #define __PPX__ 1 // PUSH2POP2: #define __PUSH2POP2__ 1 diff --git a/clang/test/SemaCXX/constexpr-default-arg.cpp b/clang/test/SemaCXX/constexpr-default-arg.cpp index ec9b2927880bd..901123bfb359f 100644 --- a/clang/test/SemaCXX/constexpr-default-arg.cpp +++ b/clang/test/SemaCXX/constexpr-default-arg.cpp @@ -32,8 +32,8 @@ void test_default_arg2() { } // Check that multiple CXXDefaultInitExprs don't cause an assertion failure. -struct A { int &&r = 0; }; // expected-note 2{{default member initializer}} +struct A { int &&r = 0; }; struct B { A x, y; }; -B b = {}; // expected-warning 2{{lifetime extension of temporary created by aggregate initialization using a default member initializer is not yet supported}} +B b = {}; // expected-no-diagnostics } diff --git a/clang/test/SemaCXX/eval-crashes.cpp b/clang/test/SemaCXX/eval-crashes.cpp index 017df977b26b7..a06f60f71e9c7 100644 --- a/clang/test/SemaCXX/eval-crashes.cpp +++ b/clang/test/SemaCXX/eval-crashes.cpp @@ -25,11 +25,9 @@ namespace pr33140_0b { } namespace pr33140_2 { - // FIXME: The declaration of 'b' below should lifetime-extend two int - // temporaries. - struct A { int &&r = 0; }; // expected-note 2{{initializing field 'r' with default member initializer}} + struct A { int &&r = 0; }; struct B { A x, y; }; - B b = {}; // expected-warning 2{{lifetime extension of temporary created by aggregate initialization using a default member initializer is not yet supported}} + B b = {}; } namespace pr33140_3 { diff --git a/clang/test/SemaTemplate/ctad.cpp b/clang/test/SemaTemplate/ctad.cpp index ec144d4f44ba8..e981ea8d5ecfb 100644 --- a/clang/test/SemaTemplate/ctad.cpp +++ b/clang/test/SemaTemplate/ctad.cpp @@ -54,3 +54,18 @@ template struct Y { Y(T); }; template struct Y ; Y y(1); } + +namespace NoCrashOnGettingDefaultArgLoc { +template +class A { + A(int = 1); // expected-note {{candidate template ignored: couldn't infer template argumen}} +}; +class C : A { + using A::A; +}; +template +class D : C { // expected-note {{candidate function template not viable: requires 1 argument}} + using C::C; +}; +D abc; // expected-error {{no viable constructor or deduction guide}} +} diff --git a/clang/tools/diagtool/ShowEnabledWarnings.cpp b/clang/tools/diagtool/ShowEnabledWarnings.cpp index 285efe6ae05b3..66a295db054c3 100644 --- a/clang/tools/diagtool/ShowEnabledWarnings.cpp +++ b/clang/tools/diagtool/ShowEnabledWarnings.cpp @@ -90,11 +90,11 @@ int ShowEnabledWarnings::run(unsigned int argc, char **argv, raw_ostream &Out) { bool ShouldShowLevels = true; if (argc > 0) { StringRef FirstArg(*argv); - if (FirstArg.equals("--no-levels")) { + if (FirstArg == "--no-levels") { ShouldShowLevels = false; --argc; ++argv; - } else if (FirstArg.equals("--levels")) { + } else if (FirstArg == "--levels") { ShouldShowLevels = true; --argc; ++argv; diff --git a/clang/tools/diagtool/TreeView.cpp b/clang/tools/diagtool/TreeView.cpp index 00d1097b5fbfd..8d1ce14b0f520 100644 --- a/clang/tools/diagtool/TreeView.cpp +++ b/clang/tools/diagtool/TreeView.cpp @@ -144,7 +144,7 @@ int TreeView::run(unsigned int argc, char **argv, llvm::raw_ostream &out) { bool Internal = false; if (argc > 0) { StringRef FirstArg(*argv); - if (FirstArg.equals("--internal")) { + if (FirstArg == "--internal") { Internal = true; --argc; ++argv; diff --git a/clang/unittests/AST/Interp/Descriptor.cpp b/clang/unittests/AST/Interp/Descriptor.cpp index 4ea0fbc285a98..053d579ea3919 100644 --- a/clang/unittests/AST/Interp/Descriptor.cpp +++ b/clang/unittests/AST/Interp/Descriptor.cpp @@ -115,7 +115,7 @@ TEST(Descriptor, Primitives) { // Check pointer stuff. // Global variables have an inline descriptor. - ASSERT_FALSE(GlobalPtr.isRoot()); + ASSERT_TRUE(GlobalPtr.isRoot()); ASSERT_TRUE(GlobalPtr.isLive()); ASSERT_FALSE(GlobalPtr.isZero()); ASSERT_FALSE(GlobalPtr.isField()); diff --git a/clang/unittests/CodeGen/IRMatchers.h b/clang/unittests/CodeGen/IRMatchers.h index 47e4204980362..3572a317f07ac 100644 --- a/clang/unittests/CodeGen/IRMatchers.h +++ b/clang/unittests/CodeGen/IRMatchers.h @@ -317,7 +317,7 @@ class NameMetaMatcher : public EntityMatcher { NameMetaMatcher(StringRef N) : Name(N) {} bool matchEntity(const Metadata &M, MatcherContext &C) override { if (auto *MDS = dyn_cast(&M)) - return MDS->getString().equals(Name); + return MDS->getString() == Name; return false; } }; diff --git a/clang/unittests/Format/QualifierFixerTest.cpp b/clang/unittests/Format/QualifierFixerTest.cpp index ada47549fd0b3..792d8f3c3a982 100644 --- a/clang/unittests/Format/QualifierFixerTest.cpp +++ b/clang/unittests/Format/QualifierFixerTest.cpp @@ -27,15 +27,11 @@ namespace { EXPECT_EQ(VALUE, Style.FIELD) << "Unexpected value after parsing!" class QualifierFixerTest : public FormatTestBase { -public: - QualifierFixerTest() : LangOpts(getFormattingLangOpts()) {} - protected: TokenList annotate(llvm::StringRef Code, const FormatStyle &Style = getLLVMStyle()) { return TestLexer(Allocator, Buffers, Style).annotate(Code); } - LangOptions LangOpts; llvm::SpecificBumpPtrAllocator Allocator; std::vector> Buffers; }; @@ -1064,75 +1060,65 @@ TEST_F(QualifierFixerTest, IsQualifierType) { ASSERT_EQ(Tokens.size(), 11u) << Tokens; EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( - Tokens[0], ConfiguredTokens, LangOpts)); + Tokens[0], ConfiguredTokens)); EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( - Tokens[1], ConfiguredTokens, LangOpts)); + Tokens[1], ConfiguredTokens)); EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( - Tokens[2], ConfiguredTokens, LangOpts)); + Tokens[2], ConfiguredTokens)); EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( - Tokens[3], ConfiguredTokens, LangOpts)); + Tokens[3], ConfiguredTokens)); EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( - Tokens[4], ConfiguredTokens, LangOpts)); + Tokens[4], ConfiguredTokens)); EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( - Tokens[5], ConfiguredTokens, LangOpts)); + Tokens[5], ConfiguredTokens)); EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( - Tokens[6], ConfiguredTokens, LangOpts)); + Tokens[6], ConfiguredTokens)); EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( - Tokens[7], ConfiguredTokens, LangOpts)); + Tokens[7], ConfiguredTokens)); EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( - Tokens[8], ConfiguredTokens, LangOpts)); + Tokens[8], ConfiguredTokens)); EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( - Tokens[9], ConfiguredTokens, LangOpts)); - - EXPECT_TRUE( - LeftRightQualifierAlignmentFixer::isQualifierOrType(Tokens[0], LangOpts)); - EXPECT_TRUE( - LeftRightQualifierAlignmentFixer::isQualifierOrType(Tokens[1], LangOpts)); - EXPECT_TRUE( - LeftRightQualifierAlignmentFixer::isQualifierOrType(Tokens[2], LangOpts)); - EXPECT_TRUE( - LeftRightQualifierAlignmentFixer::isQualifierOrType(Tokens[3], LangOpts)); - EXPECT_TRUE( - LeftRightQualifierAlignmentFixer::isQualifierOrType(Tokens[4], LangOpts)); - EXPECT_TRUE( - LeftRightQualifierAlignmentFixer::isQualifierOrType(Tokens[5], LangOpts)); - EXPECT_TRUE( - LeftRightQualifierAlignmentFixer::isQualifierOrType(Tokens[6], LangOpts)); - EXPECT_TRUE( - LeftRightQualifierAlignmentFixer::isQualifierOrType(Tokens[7], LangOpts)); - EXPECT_TRUE( - LeftRightQualifierAlignmentFixer::isQualifierOrType(Tokens[8], LangOpts)); - EXPECT_TRUE( - LeftRightQualifierAlignmentFixer::isQualifierOrType(Tokens[9], LangOpts)); + Tokens[9], ConfiguredTokens)); + + EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isQualifierOrType(Tokens[0])); + EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isQualifierOrType(Tokens[1])); + EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isQualifierOrType(Tokens[2])); + EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isQualifierOrType(Tokens[3])); + EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isQualifierOrType(Tokens[4])); + EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isQualifierOrType(Tokens[5])); + EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isQualifierOrType(Tokens[6])); + EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isQualifierOrType(Tokens[7])); + EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isQualifierOrType(Tokens[8])); + EXPECT_TRUE(LeftRightQualifierAlignmentFixer::isQualifierOrType(Tokens[9])); auto NotTokens = annotate("for while do Foo Bar "); ASSERT_EQ(NotTokens.size(), 6u) << Tokens; EXPECT_FALSE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( - NotTokens[0], ConfiguredTokens, LangOpts)); + NotTokens[0], ConfiguredTokens)); EXPECT_FALSE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( - NotTokens[1], ConfiguredTokens, LangOpts)); + NotTokens[1], ConfiguredTokens)); EXPECT_FALSE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( - NotTokens[2], ConfiguredTokens, LangOpts)); + NotTokens[2], ConfiguredTokens)); EXPECT_FALSE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( - NotTokens[3], ConfiguredTokens, LangOpts)); + NotTokens[3], ConfiguredTokens)); EXPECT_FALSE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( - NotTokens[4], ConfiguredTokens, LangOpts)); + NotTokens[4], ConfiguredTokens)); EXPECT_FALSE(LeftRightQualifierAlignmentFixer::isConfiguredQualifierOrType( - NotTokens[5], ConfiguredTokens, LangOpts)); - - EXPECT_FALSE(LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[0], - LangOpts)); - EXPECT_FALSE(LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[1], - LangOpts)); - EXPECT_FALSE(LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[2], - LangOpts)); - EXPECT_FALSE(LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[3], - LangOpts)); - EXPECT_FALSE(LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[4], - LangOpts)); - EXPECT_FALSE(LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[5], - LangOpts)); + NotTokens[5], ConfiguredTokens)); + + EXPECT_FALSE( + LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[0])); + EXPECT_FALSE( + LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[1])); + EXPECT_FALSE( + LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[2])); + EXPECT_FALSE( + LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[3])); + EXPECT_FALSE( + LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[4])); + EXPECT_FALSE( + LeftRightQualifierAlignmentFixer::isQualifierOrType(NotTokens[5])); } TEST_F(QualifierFixerTest, IsMacro) { diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index 992dc61d65e42..92fdcf5556ede 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -10698,7 +10698,7 @@

C++ defect report implementation status

1815 CD4 Lifetime extension in aggregate initialization - No + Clang 19 1816 @@ -10728,7 +10728,7 @@

C++ defect report implementation status

1820 CD6 Qualified typedef names - Unknown + Clang 3.5 1821 diff --git a/cross-project-tests/lit.cfg.py b/cross-project-tests/lit.cfg.py index 774c4eaf4d976..619634578dfe6 100644 --- a/cross-project-tests/lit.cfg.py +++ b/cross-project-tests/lit.cfg.py @@ -84,7 +84,13 @@ def get_required_attr(config, attr_name): # use_clang() and use_lld() respectively, so set them to "", if needed. if not hasattr(config, "clang_src_dir"): config.clang_src_dir = "" -llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects)) +# Facebook T92898286 +should_test_bolt = get_required_attr(config, "llvm_test_bolt") +if should_test_bolt: + llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects), additional_flags=["--post-link-optimize"]) +else: + llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects)) +# End Facebook T92898286 if not hasattr(config, "lld_src_dir"): config.lld_src_dir = "" @@ -293,3 +299,9 @@ def get_clang_default_dwarf_version_string(triple): # Allow 'REQUIRES: XXX-registered-target' in tests. for arch in config.targets_to_build: config.available_features.add(arch.lower() + "-registered-target") + +# Facebook T92898286 +# Ensure the user's PYTHONPATH is included. +if "PYTHONPATH" in os.environ: + config.environment["PYTHONPATH"] = os.environ["PYTHONPATH"] +# End Facebook T92898286 diff --git a/cross-project-tests/lit.site.cfg.py.in b/cross-project-tests/lit.site.cfg.py.in index 39458dfc79afd..2d53cd377f033 100644 --- a/cross-project-tests/lit.site.cfg.py.in +++ b/cross-project-tests/lit.site.cfg.py.in @@ -21,6 +21,10 @@ config.mlir_src_root = "@MLIR_SOURCE_DIR@" config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" +# Facebook T92898286 +config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@") +# End Facebook T92898286 + import lit.llvm lit.llvm.initialize(lit_config, config) diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 96d7b4037e106..01e9c247560ca 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -446,6 +446,7 @@ set(files __ios/fpos.h __iterator/access.h __iterator/advance.h + __iterator/aliasing_iterator.h __iterator/back_insert_iterator.h __iterator/bounded_iter.h __iterator/common_iterator.h diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h index c2b3f8938f711..632bec02406a4 100644 --- a/libcxx/include/__algorithm/mismatch.h +++ b/libcxx/include/__algorithm/mismatch.h @@ -16,6 +16,7 @@ #include <__algorithm/unwrap_iter.h> #include <__config> #include <__functional/identity.h> +#include <__iterator/aliasing_iterator.h> #include <__type_traits/desugars_to.h> #include <__type_traits/invoke.h> #include <__type_traits/is_constant_evaluated.h> @@ -55,18 +56,13 @@ __mismatch(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, _Pro #if _LIBCPP_VECTORIZE_ALGORITHMS -template ::value && __desugars_to_v<__equal_tag, _Pred, _Tp, _Tp> && - __is_identity<_Proj1>::value && __is_identity<_Proj2>::value, - int> = 0> -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Tp*, _Tp*> -__mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) { +template +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Iter, _Iter> +__mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) { + using __value_type = __iter_value_type<_Iter>; constexpr size_t __unroll_count = 4; - constexpr size_t __vec_size = __native_vector_size<_Tp>; - using __vec = __simd_vector<_Tp, __vec_size>; + constexpr size_t __vec_size = __native_vector_size<__value_type>; + using __vec = __simd_vector<__value_type, __vec_size>; if (!__libcpp_is_constant_evaluated()) { auto __orig_first1 = __first1; @@ -116,9 +112,41 @@ __mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Pred& __pred, _Proj1& __ } // else loop over the elements individually } - return std::__mismatch_loop(__first1, __last1, __first2, __pred, __proj1, __proj2); + __equal_to __pred; + __identity __proj; + return std::__mismatch_loop(__first1, __last1, __first2, __pred, __proj, __proj); +} + +template ::value && __desugars_to_v<__equal_tag, _Pred, _Tp, _Tp> && + __is_identity<_Proj1>::value && __is_identity<_Proj2>::value, + int> = 0> +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Tp*, _Tp*> +__mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Pred&, _Proj1&, _Proj2&) { + return std::__mismatch_vectorized(__first1, __last1, __first2); } +template ::value && __desugars_to_v<__equal_tag, _Pred, _Tp, _Tp> && + __is_identity<_Proj1>::value && __is_identity<_Proj2>::value && + __can_map_to_integer_v<_Tp> && __libcpp_is_trivially_equality_comparable<_Tp, _Tp>::value, + int> = 0> +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Tp*, _Tp*> +__mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) { + if (__libcpp_is_constant_evaluated()) { + return std::__mismatch_loop(__first1, __last1, __first2, __pred, __proj1, __proj2); + } else { + using _Iter = __aliasing_iterator<_Tp*, __get_as_integer_type_t<_Tp>>; + auto __ret = std::__mismatch_vectorized(_Iter(__first1), _Iter(__last1), _Iter(__first2)); + return {__ret.first.__base(), __ret.second.__base()}; + } +} #endif // _LIBCPP_VECTORIZE_ALGORITHMS template diff --git a/libcxx/include/__algorithm/simd_utils.h b/libcxx/include/__algorithm/simd_utils.h index 8d540ae2cce88..71d65e8f4afb5 100644 --- a/libcxx/include/__algorithm/simd_utils.h +++ b/libcxx/include/__algorithm/simd_utils.h @@ -43,6 +43,34 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD +template +inline constexpr bool __can_map_to_integer_v = + sizeof(_Tp) == alignof(_Tp) && (sizeof(_Tp) == 1 || sizeof(_Tp) == 2 || sizeof(_Tp) == 4 || sizeof(_Tp) == 8); + +template +struct __get_as_integer_type_impl; + +template <> +struct __get_as_integer_type_impl<1> { + using type = uint8_t; +}; + +template <> +struct __get_as_integer_type_impl<2> { + using type = uint16_t; +}; +template <> +struct __get_as_integer_type_impl<4> { + using type = uint32_t; +}; +template <> +struct __get_as_integer_type_impl<8> { + using type = uint64_t; +}; + +template +using __get_as_integer_type_t = typename __get_as_integer_type_impl::type; + // This isn't specialized for 64 byte vectors on purpose. They have the potential to significantly reduce performance // in mixed simd/non-simd workloads and don't provide any performance improvement for currently vectorized algorithms // as far as benchmarks are concerned. @@ -80,10 +108,10 @@ template using __simd_vector_underlying_type_t = decltype(std::__simd_vector_underlying_type_impl(_VecT{})); // This isn't inlined without always_inline when loading chars. -template -_LIBCPP_NODISCARD _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _VecT __load_vector(const _Tp* __ptr) noexcept { +template +_LIBCPP_NODISCARD _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _VecT __load_vector(_Iter __iter) noexcept { return [=](index_sequence<_Indices...>) _LIBCPP_ALWAYS_INLINE noexcept { - return _VecT{__ptr[_Indices]...}; + return _VecT{__iter[_Indices]...}; }(make_index_sequence<__simd_vector_size_v<_VecT>>{}); } diff --git a/libcxx/include/__iterator/aliasing_iterator.h b/libcxx/include/__iterator/aliasing_iterator.h new file mode 100644 index 0000000000000..94ba577078b5e --- /dev/null +++ b/libcxx/include/__iterator/aliasing_iterator.h @@ -0,0 +1,127 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___ITERATOR_ALIASING_ITERATOR_H +#define _LIBCPP___ITERATOR_ALIASING_ITERATOR_H + +#include <__config> +#include <__iterator/iterator_traits.h> +#include <__memory/pointer_traits.h> +#include <__type_traits/is_trivial.h> +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +// This iterator wrapper is used to type-pun an iterator to return a different type. This is done without UB by not +// actually punning the type, but instead inspecting the object representation of the base type and copying that into +// an instance of the alias type. For that reason the alias type has to be trivial. The alias is returned as a prvalue +// when derferencing the iterator, since it is temporary storage. This wrapper is used to vectorize some algorithms. + +_LIBCPP_BEGIN_NAMESPACE_STD + +template +struct __aliasing_iterator_wrapper { + class __iterator { + _BaseIter __base_ = nullptr; + + using __iter_traits = iterator_traits<_BaseIter>; + using __base_value_type = typename __iter_traits::value_type; + + static_assert(__has_random_access_iterator_category<_BaseIter>::value, + "The base iterator has to be a random access iterator!"); + + public: + using iterator_category = random_access_iterator_tag; + using value_type = _Alias; + using difference_type = ptrdiff_t; + using reference = value_type&; + using pointer = value_type*; + + static_assert(is_trivial::value); + static_assert(sizeof(__base_value_type) == sizeof(value_type)); + + _LIBCPP_HIDE_FROM_ABI __iterator() = default; + _LIBCPP_HIDE_FROM_ABI __iterator(_BaseIter __base) _NOEXCEPT : __base_(__base) {} + + _LIBCPP_HIDE_FROM_ABI __iterator& operator++() _NOEXCEPT { + ++__base_; + return *this; + } + + _LIBCPP_HIDE_FROM_ABI __iterator operator++(int) _NOEXCEPT { + __iterator __tmp(*this); + ++__base_; + return __tmp; + } + + _LIBCPP_HIDE_FROM_ABI __iterator& operator--() _NOEXCEPT { + --__base_; + return *this; + } + + _LIBCPP_HIDE_FROM_ABI __iterator operator--(int) _NOEXCEPT { + __iterator __tmp(*this); + --__base_; + return __tmp; + } + + _LIBCPP_HIDE_FROM_ABI friend __iterator operator+(__iterator __iter, difference_type __n) _NOEXCEPT { + return __iterator(__iter.__base_ + __n); + } + + _LIBCPP_HIDE_FROM_ABI friend __iterator operator+(difference_type __n, __iterator __iter) _NOEXCEPT { + return __iterator(__n + __iter.__base_); + } + + _LIBCPP_HIDE_FROM_ABI __iterator& operator+=(difference_type __n) _NOEXCEPT { + __base_ += __n; + return *this; + } + + _LIBCPP_HIDE_FROM_ABI friend __iterator operator-(__iterator __iter, difference_type __n) _NOEXCEPT { + return __iterator(__iter.__base_ - __n); + } + + _LIBCPP_HIDE_FROM_ABI friend difference_type operator-(__iterator __lhs, __iterator __rhs) _NOEXCEPT { + return __lhs.__base_ - __rhs.__base_; + } + + _LIBCPP_HIDE_FROM_ABI __iterator& operator-=(difference_type __n) _NOEXCEPT { + __base_ -= __n; + return *this; + } + + _LIBCPP_HIDE_FROM_ABI _BaseIter __base() const _NOEXCEPT { return __base_; } + + _LIBCPP_HIDE_FROM_ABI _Alias operator*() const _NOEXCEPT { + _Alias __val; + __builtin_memcpy(&__val, std::__to_address(__base_), sizeof(value_type)); + return __val; + } + + _LIBCPP_HIDE_FROM_ABI value_type operator[](difference_type __n) const _NOEXCEPT { return *(*this + __n); } + + _LIBCPP_HIDE_FROM_ABI friend bool operator==(const __iterator& __lhs, const __iterator& __rhs) _NOEXCEPT { + return __lhs.__base_ == __rhs.__base_; + } + + _LIBCPP_HIDE_FROM_ABI friend bool operator!=(const __iterator& __lhs, const __iterator& __rhs) _NOEXCEPT { + return __lhs.__base_ != __rhs.__base_; + } + }; +}; + +// This is required to avoid ADL instantiations on _BaseT +template +using __aliasing_iterator = typename __aliasing_iterator_wrapper<_BaseT, _Alias>::__iterator; + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___ITERATOR_ALIASING_ITERATOR_H diff --git a/libcxx/include/__type_traits/is_equality_comparable.h b/libcxx/include/__type_traits/is_equality_comparable.h index d4142218b641a..4397f743e5ee9 100644 --- a/libcxx/include/__type_traits/is_equality_comparable.h +++ b/libcxx/include/__type_traits/is_equality_comparable.h @@ -44,6 +44,8 @@ struct __is_equality_comparable<_Tp, _Up, __void_t() // pointers that don't have the same type (ignoring cv-qualifiers): pointers to virtual bases are equality comparable, // but don't have the same bit-pattern. An exception to this is comparing to a void-pointer. There the bit-pattern is // always compared. +// objects with padding bytes: since objects with padding bytes may compare equal, even though their object +// representation may not be equivalent. template struct __libcpp_is_trivially_equality_comparable_impl : false_type {}; diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 0ee0cdce61bc0..70dac2f19846b 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -700,6 +700,7 @@ module std_private_algorithm_minmax_element [system module std_private_algorithm_mismatch [system] { header "__algorithm/mismatch.h" export std_private_algorithm_simd_utils + export std_private_iterator_aliasing_iterator } module std_private_algorithm_move [system] { header "__algorithm/move.h" } module std_private_algorithm_move_backward [system] { header "__algorithm/move_backward.h" } @@ -1390,6 +1391,7 @@ module std_private_iosfwd_streambuf_fwd [system] { header "__fwd/streambuf.h" } module std_private_iterator_access [system] { header "__iterator/access.h" } module std_private_iterator_advance [system] { header "__iterator/advance.h" } +module std_private_iterator_aliasing_iterator [system] { header "__iterator/aliasing_iterator.h" } module std_private_iterator_back_insert_iterator [system] { header "__iterator/back_insert_iterator.h" } module std_private_iterator_bounded_iter [system] { header "__iterator/bounded_iter.h" } module std_private_iterator_common_iterator [system] { header "__iterator/common_iterator.h" } diff --git a/libcxx/test/libcxx/iterators/aliasing_iterator.pass.cpp b/libcxx/test/libcxx/iterators/aliasing_iterator.pass.cpp new file mode 100644 index 0000000000000..60587d5bfe5d7 --- /dev/null +++ b/libcxx/test/libcxx/iterators/aliasing_iterator.pass.cpp @@ -0,0 +1,45 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// ADDITIONAL_COMPILE_FLAGS(clang): -Wprivate-header + +#include <__iterator/aliasing_iterator.h> +#include + +struct NonTrivial { + int i_; + + NonTrivial(int i) : i_(i) {} + NonTrivial(const NonTrivial& other) : i_(other.i_) {} + + NonTrivial& operator=(const NonTrivial& other) { + i_ = other.i_; + return *this; + } + + ~NonTrivial() {} +}; + +int main(int, char**) { + { + NonTrivial arr[] = {1, 2, 3, 4}; + std::__aliasing_iterator iter(arr); + + assert(*iter == 1); + assert(iter[0] == 1); + assert(iter[1] == 2); + ++iter; + assert(*iter == 2); + assert(iter[-1] == 1); + assert(iter.__base() == arr + 1); + assert(iter == iter); + assert(iter != (iter + 1)); + } + + return 0; +} diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp index eb5f7cacdde34..dd37555ffcce5 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp @@ -66,14 +66,27 @@ TEST_CONSTEXPR_CXX20 void check(Container1 lhs, Container2 rhs, size_t offset) { #endif } -struct NonTrivial { +// Compares modulo 4 to make sure we only forward to the vectorized version if we are trivially equality comparable +struct NonTrivialMod4Comp { int i_; - TEST_CONSTEXPR_CXX20 NonTrivial(int i) : i_(i) {} - TEST_CONSTEXPR_CXX20 NonTrivial(NonTrivial&& other) : i_(other.i_) { other.i_ = 0; } + TEST_CONSTEXPR_CXX20 NonTrivialMod4Comp(int i) : i_(i) {} + TEST_CONSTEXPR_CXX20 NonTrivialMod4Comp(NonTrivialMod4Comp&& other) : i_(other.i_) { other.i_ = 0; } - TEST_CONSTEXPR_CXX20 friend bool operator==(const NonTrivial& lhs, const NonTrivial& rhs) { return lhs.i_ == rhs.i_; } + TEST_CONSTEXPR_CXX20 friend bool operator==(const NonTrivialMod4Comp& lhs, const NonTrivialMod4Comp& rhs) { + return lhs.i_ % 4 == rhs.i_ % 4; + } +}; + +#if TEST_STD_VER >= 20 +struct TriviallyEqualityComparable { + int i_; + + TEST_CONSTEXPR_CXX20 TriviallyEqualityComparable(int i) : i_(i) {} + + TEST_CONSTEXPR_CXX20 friend bool operator==(TriviallyEqualityComparable, TriviallyEqualityComparable) = default; }; +#endif // TEST_STD_VER >= 20 struct ModTwoComp { TEST_CONSTEXPR_CXX20 bool operator()(int lhs, int rhs) { return lhs % 2 == rhs % 2; } @@ -136,16 +149,30 @@ TEST_CONSTEXPR_CXX20 bool test() { types::for_each(types::cpp17_input_iterator_list(), Test()); { // use a non-integer type to also test the general case - all elements match - std::array lhs = {1, 2, 3, 4, 5, 6, 7, 8}; - std::array rhs = {1, 2, 3, 4, 5, 6, 7, 8}; - check(std::move(lhs), std::move(rhs), 8); + std::array lhs = {1, 2, 3, 4, 5, 6, 7, 8}; + std::array rhs = {1, 2, 3, 4, 1, 6, 7, 8}; + check(std::move(lhs), std::move(rhs), 8); } { // use a non-integer type to also test the general case - not all elements match - std::array lhs = {1, 2, 3, 4, 7, 6, 7, 8}; - std::array rhs = {1, 2, 3, 4, 5, 6, 7, 8}; - check(std::move(lhs), std::move(rhs), 4); + std::array lhs = {1, 2, 3, 4, 7, 6, 7, 8}; + std::array rhs = {1, 2, 3, 4, 5, 6, 7, 8}; + check(std::move(lhs), std::move(rhs), 4); + } + +#if TEST_STD_VER >= 20 + { // trivially equality comparable class type to test forwarding to the vectorized version - all elements match + std::array lhs = {1, 2, 3, 4, 5, 6, 7, 8}; + std::array rhs = {1, 2, 3, 4, 5, 6, 7, 8}; + check(std::move(lhs), std::move(rhs), 8); + } + + { // trivially equality comparable class type to test forwarding to the vectorized version - not all elements match + std::array lhs = {1, 2, 3, 4, 7, 6, 7, 8}; + std::array rhs = {1, 2, 3, 4, 5, 6, 7, 8}; + check(std::move(lhs), std::move(rhs), 4); } +#endif // TEST_STD_VER >= 20 return true; } diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py index d934349fe3ca3..d4a62c51458cc 100644 --- a/lldb/test/API/lit.cfg.py +++ b/lldb/test/API/lit.cfg.py @@ -248,6 +248,11 @@ def delete_module_cache(path): if is_configured("lldb_framework_dir"): dotest_cmd += ["--framework", config.lldb_framework_dir] +# Facebook T92898286 +if is_configured("llvm_test_bolt"): + dotest_cmd += ["-E", '"--post-link-optimize"'] +# End Facebook T92898286 + if ( "lldb-repro-capture" in config.available_features or "lldb-repro-replay" in config.available_features diff --git a/lldb/test/API/lit.site.cfg.py.in b/lldb/test/API/lit.site.cfg.py.in index 8b2d09ae41cd2..602f45759e48f 100644 --- a/lldb/test/API/lit.site.cfg.py.in +++ b/lldb/test/API/lit.site.cfg.py.in @@ -1,5 +1,9 @@ @LIT_SITE_CFG_IN_HEADER@ +#Facebook T92898286 +import lit.util +#End Facebook T92898286 + config.llvm_src_root = "@LLVM_SOURCE_DIR@" config.llvm_obj_root = "@LLVM_BINARY_DIR@" config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@") @@ -39,6 +43,10 @@ config.libcxx_include_target_dir = "@LIBCXX_GENERATED_INCLUDE_TARGET_DIR@" config.lldb_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_LLDB@", "lldb-api") config.clang_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_CLANG@", "lldb-api") +# Facebook T92898286 +config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@") +# End Facebook T92898286 + # Plugins lldb_build_intel_pt = '@LLDB_BUILD_INTEL_PT@' if lldb_build_intel_pt == '1': diff --git a/lldb/test/API/macosx/rosetta/TestRosetta.py b/lldb/test/API/macosx/rosetta/TestRosetta.py index ce40de475ef16..a812f558a8fc9 100644 --- a/lldb/test/API/macosx/rosetta/TestRosetta.py +++ b/lldb/test/API/macosx/rosetta/TestRosetta.py @@ -40,6 +40,7 @@ class TestRosetta(TestBase): NO_DEBUG_INFO_TESTCASE = True @skipUnlessAppleSilicon + @skipIfLLVMTargetMissing("X86") @skipIfDarwinEmbedded def test_rosetta(self): """There can be many tests in a test case - describe this test here.""" diff --git a/lldb/test/API/macosx/universal64/TestUniversal64.py b/lldb/test/API/macosx/universal64/TestUniversal64.py index 98661443086ef..893ff14d81138 100644 --- a/lldb/test/API/macosx/universal64/TestUniversal64.py +++ b/lldb/test/API/macosx/universal64/TestUniversal64.py @@ -17,6 +17,7 @@ def do_test(self): # actually launch them here. # The Makefile manually invokes clang. + @skipIfLLVMTargetMissing("X86") @skipIfAsan @skipUnlessDarwin @skipIfDarwinEmbedded @@ -26,6 +27,7 @@ def test_universal64_executable(self): self.do_test() # The Makefile manually invokes clang. + @skipIfLLVMTargetMissing("X86") @skipIfAsan @skipUnlessDarwin @skipIfDarwinEmbedded diff --git a/lldb/test/Shell/helper/toolchain.py b/lldb/test/Shell/helper/toolchain.py index 255955fc70d8c..7b7be06643166 100644 --- a/lldb/test/Shell/helper/toolchain.py +++ b/lldb/test/Shell/helper/toolchain.py @@ -165,6 +165,11 @@ def use_support_substitutions(config): if config.cmake_sysroot: host_flags += ["--sysroot={}".format(config.cmake_sysroot)] + # Facebook T92898286 + if config.llvm_test_bolt: + host_flags += ["--post-link-optimize"] + # End Facebook T92898286 + host_flags = " ".join(host_flags) config.substitutions.append(("%clang_host", "%clang " + host_flags)) config.substitutions.append(("%clangxx_host", "%clangxx " + host_flags)) diff --git a/lldb/test/Shell/lit.site.cfg.py.in b/lldb/test/Shell/lit.site.cfg.py.in index b69e7bce1bc0b..fe8323734b7db 100644 --- a/lldb/test/Shell/lit.site.cfg.py.in +++ b/lldb/test/Shell/lit.site.cfg.py.in @@ -1,5 +1,10 @@ @LIT_SITE_CFG_IN_HEADER@ +#Facebook T92898286 +import lit.util +#End Facebook T92898286 + + config.llvm_src_root = "@LLVM_SOURCE_DIR@" config.llvm_obj_root = "@LLVM_BINARY_DIR@" config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@") @@ -31,6 +36,10 @@ config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" config.lldb_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_LLDB@", "lldb-shell") config.clang_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_CLANG@", "lldb-shell") +# Facebook T92898286 +config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@") +# End Facebook T92898286 + import lit.llvm lit.llvm.initialize(lit_config, config) diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index c06e661573ed4..b699c551c3e64 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -709,6 +709,10 @@ set(LLVM_LIB_FUZZING_ENGINE "" CACHE PATH option(LLVM_USE_SPLIT_DWARF "Use -gsplit-dwarf when compiling llvm and --gdb-index when linking." OFF) +# Facebook T92898286 +option(LLVM_TEST_BOLT "Enable BOLT testing in non-BOLT tests that use clang" OFF) +# End Facebook T92898286 + # Define an option controlling whether we should build for 32-bit on 64-bit # platforms, where supported. if( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT (WIN32 OR ${CMAKE_SYSTEM_NAME} MATCHES "AIX")) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index e8d03f806715f..847d8103e6811 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -119,8 +119,7 @@ struct BitcodeHeader { }; struct ProgramHeader { - uint8_t MinorVersion : 4; - uint8_t MajorVersion : 4; + uint8_t Version; uint8_t Unused; uint16_t ShaderKind; uint32_t Size; // Size in uint32_t words including this header. @@ -131,6 +130,11 @@ struct ProgramHeader { sys::swapByteOrder(Size); Bitcode.swapBytes(); } + uint8_t getMajorVersion() { return Version >> 4; } + uint8_t getMinorVersion() { return Version & 0xF; } + static uint8_t getVersion(uint8_t Major, uint8_t Minor) { + return (Major << 4) | Minor; + } }; static_assert(sizeof(ProgramHeader) == 24, "ProgramHeader Size incorrect!"); diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index fa481886b268a..2091432d4fe27 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -233,8 +233,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { // The cost of the scalar loads/stores. InstructionCost MemoryOpCost = - VF * getMemoryOpCost(Opcode, VT->getElementType(), Alignment, - AddressSpace, CostKind); + VF * thisT()->getMemoryOpCost(Opcode, VT->getElementType(), Alignment, + AddressSpace, CostKind); // Next, compute the cost of packing the result in a vector. InstructionCost PackingCost = @@ -253,8 +253,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { getScalarizationOverhead( FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()), VF), /*Insert=*/false, /*Extract=*/true, CostKind) + - VF * (getCFInstrCost(Instruction::Br, CostKind) + - getCFInstrCost(Instruction::PHI, CostKind)); + VF * (thisT()->getCFInstrCost(Instruction::Br, CostKind) + + thisT()->getCFInstrCost(Instruction::PHI, CostKind)); } return AddrExtractCost + MemoryOpCost + PackingCost + ConditionalCost; diff --git a/llvm/include/llvm/MC/MCFragment.h b/llvm/include/llvm/MC/MCFragment.h index a9b19dc56f16a..256d98423e030 100644 --- a/llvm/include/llvm/MC/MCFragment.h +++ b/llvm/include/llvm/MC/MCFragment.h @@ -33,6 +33,7 @@ class MCFragment : public ilist_node_with_parent { public: enum FragmentType : uint8_t { FT_Align, + FT_NeverAlign, FT_Data, FT_CompactEncodedInst, FT_Fill, @@ -344,6 +345,27 @@ class MCAlignFragment : public MCFragment { } }; +class MCNeverAlignFragment : public MCFragment { + /// The alignment the end of the next fragment should avoid. + unsigned Alignment; + + /// When emitting Nops some subtargets have specific nop encodings. + const MCSubtargetInfo &STI; + +public: + MCNeverAlignFragment(unsigned Alignment, const MCSubtargetInfo &STI, + MCSection *Sec = nullptr) + : MCFragment(FT_NeverAlign, false, Sec), Alignment(Alignment), STI(STI) {} + + unsigned getAlignment() const { return Alignment; } + + const MCSubtargetInfo &getSubtargetInfo() const { return STI; } + + static bool classof(const MCFragment *F) { + return F->getKind() == MCFragment::FT_NeverAlign; + } +}; + class MCFillFragment : public MCFragment { uint8_t ValueSize; /// Value to use for filling bytes. diff --git a/llvm/include/llvm/MC/MCObjectStreamer.h b/llvm/include/llvm/MC/MCObjectStreamer.h index e212d54613980..c7d760721e369 100644 --- a/llvm/include/llvm/MC/MCObjectStreamer.h +++ b/llvm/include/llvm/MC/MCObjectStreamer.h @@ -157,6 +157,8 @@ class MCObjectStreamer : public MCStreamer { unsigned MaxBytesToEmit = 0) override; void emitCodeAlignment(Align ByteAlignment, const MCSubtargetInfo *STI, unsigned MaxBytesToEmit = 0) override; + void emitNeverAlignCodeAtEnd(unsigned ByteAlignment, + const MCSubtargetInfo &STI) override; void emitValueToOffset(const MCExpr *Offset, unsigned char Value, SMLoc Loc) override; void emitDwarfLocDirective(unsigned FileNo, unsigned Line, unsigned Column, diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h index 69867620e1bf8..544225de6e370 100644 --- a/llvm/include/llvm/MC/MCStreamer.h +++ b/llvm/include/llvm/MC/MCStreamer.h @@ -885,6 +885,12 @@ class MCStreamer { virtual void emitCodeAlignment(Align Alignment, const MCSubtargetInfo *STI, unsigned MaxBytesToEmit = 0); + /// If the end of the fragment following this NeverAlign fragment ever gets + /// aligned to \p ByteAlignment, this fragment emits a single nop before the + /// following fragment to break this end-alignment. + virtual void emitNeverAlignCodeAtEnd(unsigned ByteAlignment, + const MCSubtargetInfo &STI); + /// Emit some number of copies of \p Value until the byte offset \p /// Offset is reached. /// diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def index a9ed56fcd4700..5670767ff7edf 100644 --- a/llvm/include/llvm/TargetParser/X86TargetParser.def +++ b/llvm/include/llvm/TargetParser/X86TargetParser.def @@ -253,6 +253,7 @@ X86_FEATURE_COMPAT(USERMSR, "usermsr", 0) X86_FEATURE_COMPAT(AVX10_1, "avx10.1-256", 0) X86_FEATURE_COMPAT(AVX10_1_512, "avx10.1-512", 0) X86_FEATURE (EVEX512, "evex512") +X86_FEATURE (NF, "nf") X86_FEATURE (CF, "cf") // These features aren't really CPU features, but the frontend can set them. X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk") diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index ad30b5ce9e631..62baeb93ea7d0 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -298,6 +298,43 @@ bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout, const MCFixup &Fixup, return IsResolved; } +/// Check if the branch crosses the boundary. +/// +/// \param StartAddr start address of the fused/unfused branch. +/// \param Size size of the fused/unfused branch. +/// \param BoundaryAlignment alignment requirement of the branch. +/// \returns true if the branch cross the boundary. +static bool mayCrossBoundary(uint64_t StartAddr, uint64_t Size, + Align BoundaryAlignment) { + uint64_t EndAddr = StartAddr + Size; + return (StartAddr >> Log2(BoundaryAlignment)) != + ((EndAddr - 1) >> Log2(BoundaryAlignment)); +} + +/// Check if the branch is against the boundary. +/// +/// \param StartAddr start address of the fused/unfused branch. +/// \param Size size of the fused/unfused branch. +/// \param BoundaryAlignment alignment requirement of the branch. +/// \returns true if the branch is against the boundary. +static bool isAgainstBoundary(uint64_t StartAddr, uint64_t Size, + Align BoundaryAlignment) { + uint64_t EndAddr = StartAddr + Size; + return (EndAddr & (BoundaryAlignment.value() - 1)) == 0; +} + +/// Check if the branch needs padding. +/// +/// \param StartAddr start address of the fused/unfused branch. +/// \param Size size of the fused/unfused branch. +/// \param BoundaryAlignment alignment requirement of the branch. +/// \returns true if the branch needs padding. +static bool needPadding(uint64_t StartAddr, uint64_t Size, + Align BoundaryAlignment) { + return mayCrossBoundary(StartAddr, Size, BoundaryAlignment) || + isAgainstBoundary(StartAddr, Size, BoundaryAlignment); +} + uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout, const MCFragment &F) const { assert(getBackendPtr() && "Requires assembler backend"); @@ -358,6 +395,41 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout, return Size; } + case MCFragment::FT_NeverAlign: { + // Disclaimer: NeverAlign fragment size depends on the size of its immediate + // successor, but NeverAlign need not be a MCRelaxableFragment. + // NeverAlign fragment size is recomputed if the successor is relaxed: + // - If RelaxableFragment is relaxed, it gets invalidated by marking its + // predecessor as LastValidFragment. + // - This forces the assembler to call MCAsmLayout::layoutFragment on that + // relaxable fragment, which in turn will always ask the predecessor to + // compute its size (see "computeFragmentSize(prev)" in layoutFragment). + // + // In short, the simplest way to ensure that computeFragmentSize() is sane + // is to establish the following rule: it should never examine fragments + // after the current fragment in the section. If we logically need to + // examine any fragment after the current fragment, we need to do that using + // relaxation, inside MCAssembler::layoutSectionOnce. + const MCNeverAlignFragment &NAF = cast(F); + const MCFragment *NF = F.getNextNode(); + uint64_t Offset = Layout.getFragmentOffset(&NAF); + size_t NextFragSize = 0; + if (const auto *NextFrag = dyn_cast(NF)) { + NextFragSize = NextFrag->getContents().size(); + } else if (const auto *NextFrag = dyn_cast(NF)) { + NextFragSize = NextFrag->getContents().size(); + } else { + llvm_unreachable("Didn't find the expected fragment after NeverAlign"); + } + // Check if the next fragment ends at the alignment we want to avoid. + if (isAgainstBoundary(Offset, NextFragSize, Align(NAF.getAlignment()))) { + // Avoid this alignment by introducing minimum nop. + assert(getBackend().getMinimumNopSize() != NAF.getAlignment()); + return getBackend().getMinimumNopSize(); + } + return 0; + } + case MCFragment::FT_Org: { const MCOrgFragment &OF = cast(F); MCValue Value; @@ -581,6 +653,15 @@ static void writeFragment(raw_ostream &OS, const MCAssembler &Asm, break; } + case MCFragment::FT_NeverAlign: { + const MCNeverAlignFragment &NAF = cast(F); + if (!Asm.getBackend().writeNopData(OS, FragmentSize, + &NAF.getSubtargetInfo())) + report_fatal_error("unable to write nop sequence of " + + Twine(FragmentSize) + " bytes"); + break; + } + case MCFragment::FT_Data: ++stats::EmittedDataFragments; OS << cast(F).getContents(); @@ -1052,43 +1133,6 @@ bool MCAssembler::relaxLEB(MCAsmLayout &Layout, MCLEBFragment &LF) { return OldSize != LF.getContents().size(); } -/// Check if the branch crosses the boundary. -/// -/// \param StartAddr start address of the fused/unfused branch. -/// \param Size size of the fused/unfused branch. -/// \param BoundaryAlignment alignment requirement of the branch. -/// \returns true if the branch cross the boundary. -static bool mayCrossBoundary(uint64_t StartAddr, uint64_t Size, - Align BoundaryAlignment) { - uint64_t EndAddr = StartAddr + Size; - return (StartAddr >> Log2(BoundaryAlignment)) != - ((EndAddr - 1) >> Log2(BoundaryAlignment)); -} - -/// Check if the branch is against the boundary. -/// -/// \param StartAddr start address of the fused/unfused branch. -/// \param Size size of the fused/unfused branch. -/// \param BoundaryAlignment alignment requirement of the branch. -/// \returns true if the branch is against the boundary. -static bool isAgainstBoundary(uint64_t StartAddr, uint64_t Size, - Align BoundaryAlignment) { - uint64_t EndAddr = StartAddr + Size; - return (EndAddr & (BoundaryAlignment.value() - 1)) == 0; -} - -/// Check if the branch needs padding. -/// -/// \param StartAddr start address of the fused/unfused branch. -/// \param Size size of the fused/unfused branch. -/// \param BoundaryAlignment alignment requirement of the branch. -/// \returns true if the branch needs padding. -static bool needPadding(uint64_t StartAddr, uint64_t Size, - Align BoundaryAlignment) { - return mayCrossBoundary(StartAddr, Size, BoundaryAlignment) || - isAgainstBoundary(StartAddr, Size, BoundaryAlignment); -} - bool MCAssembler::relaxBoundaryAlign(MCAsmLayout &Layout, MCBoundaryAlignFragment &BF) { // BoundaryAlignFragment that doesn't need to align any fragment should not be diff --git a/llvm/lib/MC/MCDXContainerWriter.cpp b/llvm/lib/MC/MCDXContainerWriter.cpp index 0580dc7e42826..ff64c6e538ac6 100644 --- a/llvm/lib/MC/MCDXContainerWriter.cpp +++ b/llvm/lib/MC/MCDXContainerWriter.cpp @@ -117,9 +117,11 @@ uint64_t DXContainerObjectWriter::writeObject(MCAssembler &Asm, const Triple &TT = Asm.getContext().getTargetTriple(); VersionTuple Version = TT.getOSVersion(); - Header.MajorVersion = static_cast(Version.getMajor()); - if (Version.getMinor()) - Header.MinorVersion = static_cast(*Version.getMinor()); + uint8_t MajorVersion = static_cast(Version.getMajor()); + uint8_t MinorVersion = + static_cast(Version.getMinor().value_or(0)); + Header.Version = + dxbc::ProgramHeader::getVersion(MajorVersion, MinorVersion); if (TT.hasEnvironment()) Header.ShaderKind = static_cast(TT.getEnvironment() - Triple::Pixel); diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp index a8da46dbd8727..2626da7e0391a 100644 --- a/llvm/lib/MC/MCFragment.cpp +++ b/llvm/lib/MC/MCFragment.cpp @@ -274,6 +274,9 @@ void MCFragment::destroy() { case FT_Align: delete cast(this); return; + case FT_NeverAlign: + delete cast(this); + return; case FT_Data: delete cast(this); return; @@ -342,6 +345,9 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { OS << "<"; switch (getKind()) { case MCFragment::FT_Align: OS << "MCAlignFragment"; break; + case MCFragment::FT_NeverAlign: + OS << "MCNeverAlignFragment"; + break; case MCFragment::FT_Data: OS << "MCDataFragment"; break; case MCFragment::FT_CompactEncodedInst: OS << "MCCompactEncodedInstFragment"; break; @@ -381,6 +387,12 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { << " MaxBytesToEmit:" << AF->getMaxBytesToEmit() << ">"; break; } + case MCFragment::FT_NeverAlign: { + const MCNeverAlignFragment *NAF = cast(this); + OS << "\n "; + OS << " Alignment:" << NAF->getAlignment() << ">"; + break; + } case MCFragment::FT_Data: { const auto *DF = cast(this); OS << "\n "; diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index d2da5d0d3f90f..9d0ee3913e582 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -661,6 +661,11 @@ void MCObjectStreamer::emitCodeAlignment(Align Alignment, cast(getCurrentFragment())->setEmitNops(true, STI); } +void MCObjectStreamer::emitNeverAlignCodeAtEnd(unsigned ByteAlignment, + const MCSubtargetInfo &STI) { + insert(new MCNeverAlignFragment(ByteAlignment, STI)); +} + void MCObjectStreamer::emitValueToOffset(const MCExpr *Offset, unsigned char Value, SMLoc Loc) { diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp index 176d55aa890be..9d341497bac45 100644 --- a/llvm/lib/MC/MCStreamer.cpp +++ b/llvm/lib/MC/MCStreamer.cpp @@ -1235,6 +1235,8 @@ void MCStreamer::emitValueToAlignment(Align Alignment, int64_t Value, unsigned MaxBytesToEmit) {} void MCStreamer::emitCodeAlignment(Align Alignment, const MCSubtargetInfo *STI, unsigned MaxBytesToEmit) {} +void MCStreamer::emitNeverAlignCodeAtEnd(unsigned ByteAlignment, + const MCSubtargetInfo &STI) {} void MCStreamer::emitValueToOffset(const MCExpr *Offset, unsigned char Value, SMLoc Loc) {} void MCStreamer::emitBundleAlignMode(Align Alignment) {} diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp index f3a518df31750..175f1a12f9314 100644 --- a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp +++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp @@ -131,8 +131,8 @@ void DXContainerWriter::writeParts(raw_ostream &OS) { if (!P.Program) continue; dxbc::ProgramHeader Header; - Header.MajorVersion = P.Program->MajorVersion; - Header.MinorVersion = P.Program->MinorVersion; + Header.Version = dxbc::ProgramHeader::getVersion(P.Program->MajorVersion, + P.Program->MinorVersion); Header.Unused = 0; Header.ShaderKind = P.Program->ShaderKind; memcpy(Header.Bitcode.Magic, "DXIL", 4); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index 26229af638f22..0e3bc63919f06 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -18,6 +18,7 @@ #include "llvm/BinaryFormat/MsgPackDocument.h" #include "llvm/Support/AMDGPUMetadata.h" #include "llvm/Support/Alignment.h" +#include "llvm/Support/Compiler.h" namespace llvm { @@ -61,7 +62,8 @@ class MetadataStreamer { msgpack::MapDocNode Kern) = 0; }; -class MetadataStreamerMsgPackV4 : public MetadataStreamer { +class LLVM_EXTERNAL_VISIBILITY MetadataStreamerMsgPackV4 + : public MetadataStreamer { protected: std::unique_ptr HSAMetadataDoc = std::make_unique(); diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h index c0a353033c3c5..e66e5a194c8b5 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h @@ -17,6 +17,7 @@ #define LLVM_LIB_TARGET_AMDGPU_SIPROGRAMINFO_H #include "llvm/IR/CallingConv.h" +#include "llvm/Support/Compiler.h" #include namespace llvm { @@ -27,7 +28,7 @@ class MCExpr; class MachineFunction; /// Track resource usage for kernels / entry functions. -struct SIProgramInfo { +struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo { // Fields set in PGM_RSRC1 pm4 packet. const MCExpr *VGPRBlocks = nullptr; const MCExpr *SGPRBlocks = nullptr; diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 62b4a9278954c..c4addfcfe2b51 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -1153,6 +1153,7 @@ class X86AsmParser : public MCTargetAsmParser { bool parseDirectiveArch(); bool parseDirectiveNops(SMLoc L); bool parseDirectiveEven(SMLoc L); + bool parseDirectiveAvoidEndAlign(SMLoc L); bool ParseDirectiveCode(StringRef IDVal, SMLoc L); /// CodeView FPO data directives. @@ -4601,6 +4602,8 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { return false; } else if (IDVal == ".nops") return parseDirectiveNops(DirectiveID.getLoc()); + else if (IDVal == ".avoid_end_align") + return parseDirectiveAvoidEndAlign(DirectiveID.getLoc()); else if (IDVal == ".even") return parseDirectiveEven(DirectiveID.getLoc()); else if (IDVal == ".cv_fpo_proc") @@ -4695,6 +4698,27 @@ bool X86AsmParser::parseDirectiveEven(SMLoc L) { return false; } +/// Directive for NeverAlign fragment testing, not for general usage! +/// parseDirectiveAvoidEndAlign +/// ::= .avoid_end_align alignment +bool X86AsmParser::parseDirectiveAvoidEndAlign(SMLoc L) { + int64_t Alignment = 0; + SMLoc AlignmentLoc; + AlignmentLoc = getTok().getLoc(); + if (getParser().checkForValidSection() || + getParser().parseAbsoluteExpression(Alignment)) + return true; + + if (getParser().parseEOL("unexpected token in directive")) + return true; + + if (Alignment <= 0) + return Error(AlignmentLoc, "expected a positive alignment"); + + getParser().getStreamer().emitNeverAlignCodeAtEnd(Alignment, getSTI()); + return false; +} + /// ParseDirectiveCode /// ::= .code16 | .code32 | .code64 bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) { diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 9f5b58d78fcce..54642ecde18c0 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -351,6 +351,8 @@ def FeatureNDD : SubtargetFeature<"ndd", "HasNDD", "true", "Support non-destructive destination">; def FeatureCCMP : SubtargetFeature<"ccmp", "HasCCMP", "true", "Support conditional cmp & test instructions">; +def FeatureNF : SubtargetFeature<"nf", "HasNF", "true", + "Support status flags update suppression">; def FeatureCF : SubtargetFeature<"cf", "HasCF", "true", "Support conditional faulting">; diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 14c62893766ad..ea3b84d0ca9eb 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1560,7 +1560,9 @@ void X86DAGToDAGISel::PostprocessISelDAG() { SDValue And = N->getOperand(0); unsigned N0Opc = And.getMachineOpcode(); if ((N0Opc == X86::AND8rr || N0Opc == X86::AND16rr || - N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) && + N0Opc == X86::AND32rr || N0Opc == X86::AND64rr || + N0Opc == X86::AND8rr_ND || N0Opc == X86::AND16rr_ND || + N0Opc == X86::AND32rr_ND || N0Opc == X86::AND64rr_ND) && !And->hasAnyUseOfValue(1)) { MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i32, @@ -1571,15 +1573,25 @@ void X86DAGToDAGISel::PostprocessISelDAG() { continue; } if ((N0Opc == X86::AND8rm || N0Opc == X86::AND16rm || - N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) && + N0Opc == X86::AND32rm || N0Opc == X86::AND64rm || + N0Opc == X86::AND8rm_ND || N0Opc == X86::AND16rm_ND || + N0Opc == X86::AND32rm_ND || N0Opc == X86::AND64rm_ND) && !And->hasAnyUseOfValue(1)) { unsigned NewOpc; +#define CASE_ND(OP) \ + case X86::OP: \ + case X86::OP##_ND: +#define FROM_TO(A, B) \ + CASE_ND(A) NewOpc = X86::B; \ + break; switch (N0Opc) { - case X86::AND8rm: NewOpc = X86::TEST8mr; break; - case X86::AND16rm: NewOpc = X86::TEST16mr; break; - case X86::AND32rm: NewOpc = X86::TEST32mr; break; - case X86::AND64rm: NewOpc = X86::TEST64mr; break; + FROM_TO(AND8rm, TEST8mr); + FROM_TO(AND16rm, TEST16mr); + FROM_TO(AND32rm, TEST32mr); + FROM_TO(AND64rm, TEST64mr); } +#undef FROM_TO +#undef CASE_ND // Need to swap the memory and register operand. SDValue Ops[] = { And.getOperand(1), diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index d43480d0a0125..0a0bc7b32e87f 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -5769,7 +5769,6 @@ int X86TTIImpl::getScatterOverhead() const { } // Return an average cost of Gather / Scatter instruction, maybe improved later. -// FIXME: Add TargetCostKind support. InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, TTI::TargetCostKind CostKind, Type *SrcVTy, const Value *Ptr, @@ -5841,62 +5840,12 @@ InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, CostKind); } -/// Return the cost of full scalarization of gather / scatter operation. -/// -/// Opcode - Load or Store instruction. -/// SrcVTy - The type of the data vector that should be gathered or scattered. -/// VariableMask - The mask is non-constant at compile time. -/// Alignment - Alignment for one element. -/// AddressSpace - pointer[s] address space. -/// TODO: Remove this and use getCommonMaskedMemoryOpCost directly. -InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, - TTI::TargetCostKind CostKind, - Type *SrcVTy, bool VariableMask, - Align Alignment, - unsigned AddressSpace) { - Type *ScalarTy = SrcVTy->getScalarType(); - unsigned VF = cast(SrcVTy)->getNumElements(); - APInt DemandedElts = APInt::getAllOnes(VF); - - InstructionCost MaskUnpackCost = 0; - if (VariableMask) { - auto *MaskTy = - FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF); - MaskUnpackCost = getScalarizationOverhead( - MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind); - InstructionCost ScalarCompareCost = getCmpSelInstrCost( - Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr, - CmpInst::BAD_ICMP_PREDICATE, CostKind); - InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind); - MaskUnpackCost += VF * (BranchCost + ScalarCompareCost); - } - - InstructionCost AddressUnpackCost = getScalarizationOverhead( - FixedVectorType::get(PointerType::getUnqual(ScalarTy->getContext()), VF), - DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind); - - // The cost of the scalar loads/stores. - InstructionCost MemoryOpCost = - VF * getMemoryOpCost(Opcode, ScalarTy, MaybeAlign(Alignment), - AddressSpace, CostKind); - - // The cost of forming the vector from loaded scalars/ - // scalarizing the vector to perform scalar stores. - InstructionCost InsertExtractCost = getScalarizationOverhead( - cast(SrcVTy), DemandedElts, - /*Insert=*/Opcode == Instruction::Load, - /*Extract=*/Opcode == Instruction::Store, CostKind); - - return AddressUnpackCost + MemoryOpCost + MaskUnpackCost + InsertExtractCost; -} - /// Calculate the cost of Gather / Scatter operation InstructionCost X86TTIImpl::getGatherScatterOpCost( unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I = nullptr) { - if (CostKind != TTI::TCK_RecipThroughput && - ((Opcode == Instruction::Load && + if (((Opcode == Instruction::Load && (!isLegalMaskedGather(SrcVTy, Align(Alignment)) || forceScalarizeMaskedGather(cast(SrcVTy), Align(Alignment)))) || @@ -5914,18 +5863,6 @@ InstructionCost X86TTIImpl::getGatherScatterOpCost( cast(Ptr->getType())->getElementType()); assert(PtrTy && "Unexpected type for Ptr argument"); unsigned AddressSpace = PtrTy->getAddressSpace(); - - if ((Opcode == Instruction::Load && - (!isLegalMaskedGather(SrcVTy, Align(Alignment)) || - forceScalarizeMaskedGather(cast(SrcVTy), - Align(Alignment)))) || - (Opcode == Instruction::Store && - (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) || - forceScalarizeMaskedScatter(cast(SrcVTy), - Align(Alignment))))) - return getGSScalarCost(Opcode, CostKind, SrcVTy, VariableMask, Alignment, - AddressSpace); - return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment, AddressSpace); } diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index d720cc136b8ae..e14dc9fc09051 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -295,9 +295,6 @@ class X86TTIImpl : public BasicTTIImplBase { private: bool supportsGather() const; - InstructionCost getGSScalarCost(unsigned Opcode, TTI::TargetCostKind CostKind, - Type *DataTy, bool VariableMask, - Align Alignment, unsigned AddressSpace); InstructionCost getGSVectorCost(unsigned Opcode, TTI::TargetCostKind CostKind, Type *DataTy, const Value *Ptr, Align Alignment, unsigned AddressSpace); diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp index 21f46f576490a..efe392b945452 100644 --- a/llvm/lib/TargetParser/X86TargetParser.cpp +++ b/llvm/lib/TargetParser/X86TargetParser.cpp @@ -632,6 +632,7 @@ constexpr FeatureBitset ImpliedFeaturesPush2Pop2 = {}; constexpr FeatureBitset ImpliedFeaturesPPX = {}; constexpr FeatureBitset ImpliedFeaturesNDD = {}; constexpr FeatureBitset ImpliedFeaturesCCMP = {}; +constexpr FeatureBitset ImpliedFeaturesNF = {}; constexpr FeatureBitset ImpliedFeaturesCF = {}; constexpr FeatureInfo FeatureInfos[X86::CPU_FEATURE_MAX] = { diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 802467b5b1835..74a8f1958dfe9 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1279,9 +1279,11 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { // only happen in dead loops. if (AvailableVal == LoadI) AvailableVal = PoisonValue::get(LoadI->getType()); - if (AvailableVal->getType() != LoadI->getType()) + if (AvailableVal->getType() != LoadI->getType()) { AvailableVal = CastInst::CreateBitOrPointerCast( AvailableVal, LoadI->getType(), "", LoadI->getIterator()); + cast(AvailableVal)->setDebugLoc(LoadI->getDebugLoc()); + } LoadI->replaceAllUsesWith(AvailableVal); LoadI->eraseFromParent(); return true; @@ -2987,6 +2989,7 @@ bool JumpThreadingPass::tryToUnfoldSelectInCurrBB(BasicBlock *BB) { PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI->getIterator()); NewPN->addIncoming(SI->getTrueValue(), Term->getParent()); NewPN->addIncoming(SI->getFalseValue(), BB); + NewPN->setDebugLoc(SI->getDebugLoc()); SI->replaceAllUsesWith(NewPN); SI->eraseFromParent(); // NewBB and SplitBB are newly created blocks which require insertion. @@ -3124,6 +3127,7 @@ bool JumpThreadingPass::threadGuard(BasicBlock *BB, IntrinsicInst *Guard, PHINode *NewPN = PHINode::Create(Inst->getType(), 2); NewPN->addIncoming(UnguardedMapping[Inst], UnguardedBlock); NewPN->addIncoming(GuardedMapping[Inst], GuardedBlock); + NewPN->setDebugLoc(Inst->getDebugLoc()); NewPN->insertBefore(InsertionPoint); Inst->replaceAllUsesWith(NewPN); } diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index e50413de46b1b..6aa4188d1cc4d 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -933,12 +933,14 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags()); SafetyInfo->insertInstructionTo(ReciprocalDivisor, I.getParent()); ReciprocalDivisor->insertBefore(&I); + ReciprocalDivisor->setDebugLoc(I.getDebugLoc()); auto Product = BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor); Product->setFastMathFlags(I.getFastMathFlags()); SafetyInfo->insertInstructionTo(Product, I.getParent()); Product->insertAfter(&I); + Product->setDebugLoc(I.getDebugLoc()); I.replaceAllUsesWith(Product); eraseInstruction(I, *SafetyInfo, MSSAU); diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 8573a8adf53b3..9d43fb4ab607c 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1729,7 +1729,9 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) { return false; // Look for an identity value. - if (Item[0].second == 0 && Item[0].first->getType() == Ty && + if (Item[0].second == 0 && + cast(Item[0].first->getType())->getNumElements() == + Ty->getNumElements() && all_of(drop_begin(enumerate(Item)), [&](const auto &E) { return !E.value().first || (E.value().first == Item[0].first && E.value().second == (int)E.index()); @@ -1773,6 +1775,20 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) { Worklist.push_back(GenerateInstLaneVectorFromOperand(Item, 1)); } else if (isa(Item[0].first)) { Worklist.push_back(GenerateInstLaneVectorFromOperand(Item, 0)); + } else if (auto *II = dyn_cast(Item[0].first); + II && isTriviallyVectorizable(II->getIntrinsicID())) { + for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) { + if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op)) { + if (!all_of(drop_begin(Item), [&](InstLane &IL) { + return !IL.first || + (cast(IL.first)->getOperand(Op) == + cast(Item[0].first)->getOperand(Op)); + })) + return false; + continue; + } + Worklist.push_back(GenerateInstLaneVectorFromOperand(Item, Op)); + } } else { return false; } @@ -1799,13 +1815,24 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) { } auto *I = cast(Item[0].first); - SmallVector Ops(I->getNumOperands()); - for (unsigned Idx = 0, E = I->getNumOperands(); Idx < E; Idx++) + auto *II = dyn_cast(I); + unsigned NumOps = I->getNumOperands() - (II ? 1 : 0); + SmallVector Ops(NumOps); + for (unsigned Idx = 0; Idx < NumOps; Idx++) { + if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx)) { + Ops[Idx] = II->getOperand(Idx); + continue; + } Ops[Idx] = Generate(GenerateInstLaneVectorFromOperand(Item, Idx)); + } Builder.SetInsertPoint(I); + Type *DstTy = FixedVectorType::get(I->getType()->getScalarType(), + Ty->getNumElements()); if (auto BI = dyn_cast(I)) return Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(), Ops[0], Ops[1]); + if (II) + return Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops); assert(isa(I) && "Unexpected instruction type in Generate"); return Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]); diff --git a/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll index f5ca6a22b60ac..af41ed92319cf 100644 --- a/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll +++ b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll @@ -5,24 +5,24 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define void @fixed() { ; CHECK-LABEL: 'fixed' -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 8, <16 x i1> undef, <16 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 216 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 8, <16 x i1> undef, <16 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 152 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; entry: diff --git a/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll b/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll index 521a0900c844e..6f9f64a26851a 100644 --- a/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll +++ b/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll @@ -212,11 +212,11 @@ define <4 x i8> @gather_load_4xi8_constant_mask(<4 x ptr> %ptrs) { define <4 x i8> @gather_load_4xi8_variable_mask(<4 x ptr> %ptrs, <4 x i1> %cond) { ; CHECK: gather_load_4xi8_variable_mask ; CHECK-NEON-LABEL: 'gather_load_4xi8_variable_mask' -; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef) ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %lv ; ; CHECK-SVE-128-LABEL: 'gather_load_4xi8_variable_mask' -; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef) +; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef) ; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %lv ; ; CHECK-SVE-256-LABEL: 'gather_load_4xi8_variable_mask' @@ -257,11 +257,11 @@ define void @scatter_store_4xi8_constant_mask(<4 x i8> %val, <4 x ptr> %ptrs) { define void @scatter_store_4xi8_variable_mask(<4 x i8> %val, <4 x ptr> %ptrs, <4 x i1> %cond) { ; CHECK: scatter_store_4xi8_variable_mask ; CHECK-NEON-LABEL: 'scatter_store_4xi8_variable_mask' -; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond) ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-128-LABEL: 'scatter_store_4xi8_variable_mask' -; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond) +; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond) ; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-256-LABEL: 'scatter_store_4xi8_variable_mask' @@ -302,11 +302,11 @@ define <4 x i32> @gather_load_4xi32_constant_mask(<4 x ptr> %ptrs) { define <4 x i32> @gather_load_4xi32_variable_mask(<4 x ptr> %ptrs, <4 x i1> %cond) { ; CHECK: gather_load_4xi32_variable_mask ; CHECK-NEON-LABEL: 'gather_load_4xi32_variable_mask' -; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef) ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %lv ; ; CHECK-SVE-128-LABEL: 'gather_load_4xi32_variable_mask' -; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef) +; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef) ; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %lv ; ; CHECK-SVE-256-LABEL: 'gather_load_4xi32_variable_mask' @@ -347,11 +347,11 @@ define void @scatter_store_4xi32_constant_mask(<4 x i32> %val, <4 x ptr> %ptrs) define void @scatter_store_4xi32_variable_mask(<4 x i32> %val, <4 x ptr> %ptrs, <4 x i1> %cond) { ; CHECK: scatter_store_4xi32_variable_mask ; CHECK-NEON-LABEL: 'scatter_store_4xi32_variable_mask' -; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond) ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-128-LABEL: 'scatter_store_4xi32_variable_mask' -; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond) +; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond) ; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-256-LABEL: 'scatter_store_4xi32_variable_mask' @@ -370,11 +370,11 @@ declare <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr>, i32, <256 x define void @sve_gather_vls(<256 x i1> %v256i1mask) { ; CHECK-LABEL: 'sve_scatter_vls' ; CHECK-NEON-LABEL: 'sve_gather_vls' -; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2304 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer) ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-128-LABEL: 'sve_gather_vls' -; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 2304 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer) +; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer) ; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-256-LABEL: 'sve_gather_vls' @@ -394,11 +394,11 @@ declare <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr>, i32, <256 define void @sve_gather_vls_float(<256 x i1> %v256i1mask) { ; CHECK-LABEL: 'sve_gather_vls_float' ; CHECK-NEON-LABEL: 'sve_gather_vls_float' -; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2176 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1664 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer) ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-128-LABEL: 'sve_gather_vls_float' -; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 2176 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer) +; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 1664 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer) ; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-256-LABEL: 'sve_gather_vls_float' @@ -418,11 +418,11 @@ declare void @llvm.masked.scatter.v256i8.v256p0(<256 x i8>, <256 x ptr>, i32, <2 define void @sve_scatter_vls(<256 x i1> %v256i1mask){ ; CHECK-LABEL: 'sve_scatter_vls' ; CHECK-NEON-LABEL: 'sve_scatter_vls' -; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2304 for instruction: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> undef, i32 0, <256 x i1> %v256i1mask) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> undef, i32 0, <256 x i1> %v256i1mask) ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-128-LABEL: 'sve_scatter_vls' -; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 2304 for instruction: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> undef, i32 0, <256 x i1> %v256i1mask) +; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> undef, i32 0, <256 x i1> %v256i1mask) ; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-256-LABEL: 'sve_scatter_vls' @@ -442,11 +442,11 @@ declare void @llvm.masked.scatter.v512f16.v512p0(<512 x half>, <512 x ptr>, i32, define void @sve_scatter_vls_float(<512 x i1> %v512i1mask){ ; CHECK-LABEL: 'sve_scatter_vls_float' ; CHECK-NEON-LABEL: 'sve_scatter_vls_float' -; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 4480 for instruction: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> undef, i32 0, <512 x i1> %v512i1mask) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 3456 for instruction: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> undef, i32 0, <512 x i1> %v512i1mask) ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-128-LABEL: 'sve_scatter_vls_float' -; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 4480 for instruction: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> undef, i32 0, <512 x i1> %v512i1mask) +; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 3456 for instruction: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> undef, i32 0, <512 x i1> %v512i1mask) ; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-256-LABEL: 'sve_scatter_vls_float' diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll b/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll index a9c18e20c1f58..c05339d89d35c 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll @@ -107,15 +107,15 @@ define void @masked_gathers_no_vscale_range() #2 { define <2 x i128> @masked_gather_v1i128(<2 x ptr> %ld, <2 x i1> %masks, <2 x i128> %passthru) #3 { ; CHECK-LABEL: 'masked_gather_v1i128' -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i128> %res ; ; CHECK-VSCALE-2-LABEL: 'masked_gather_v1i128' -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru) ; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i128> %res ; ; CHECK-VSCALE-1-LABEL: 'masked_gather_v1i128' -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru) ; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i128> %res ; %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru) diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll index 15c278b060c96..1ff280d75b4e9 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll @@ -829,7 +829,7 @@ define @masked_gather_nxv8i32( %ld, @masked_gather_v4i32(<4 x ptr> %ld, <4 x i1> %masks, <4 x i32> %passthru) { ; CHECK-LABEL: 'masked_gather_v4i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru) +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res ; ; TYPE_BASED_ONLY-LABEL: 'masked_gather_v4i32' @@ -842,7 +842,7 @@ define <4 x i32> @masked_gather_v4i32(<4 x ptr> %ld, <4 x i1> %masks, <4 x i32> define <1 x i128> @masked_gather_v1i128(<1 x ptr> %ld, <1 x i1> %masks, <1 x i128> %passthru) { ; CHECK-LABEL: 'masked_gather_v1i128' -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <1 x i128> @llvm.masked.gather.v1i128.v1p0(<1 x ptr> %ld, i32 0, <1 x i1> %masks, <1 x i128> %passthru) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <1 x i128> @llvm.masked.gather.v1i128.v1p0(<1 x ptr> %ld, i32 0, <1 x i1> %masks, <1 x i128> %passthru) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <1 x i128> %res ; ; TYPE_BASED_ONLY-LABEL: 'masked_gather_v1i128' @@ -883,7 +883,7 @@ define void @masked_scatter_nxv8i32( %data, define void @masked_scatter_v4i32(<4 x i32> %data, <4 x ptr> %ptrs, <4 x i1> %masks) { ; CHECK-LABEL: 'masked_scatter_v4i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data, <4 x ptr> %ptrs, i32 0, <4 x i1> %masks) +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data, <4 x ptr> %ptrs, i32 0, <4 x i1> %masks) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; TYPE_BASED_ONLY-LABEL: 'masked_scatter_v4i32' @@ -897,7 +897,7 @@ define void @masked_scatter_v4i32(<4 x i32> %data, <4 x ptr> %ptrs, <4 x i1> %ma define void @masked_scatter_v1i128(<1 x i128> %data, <1 x ptr> %ptrs, <1 x i1> %masks) { ; CHECK-LABEL: 'masked_scatter_v1i128' -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v1i128.v1p0(<1 x i128> %data, <1 x ptr> %ptrs, i32 0, <1 x i1> %masks) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v1i128.v1p0(<1 x i128> %data, <1 x ptr> %ptrs, i32 0, <1 x i1> %masks) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; TYPE_BASED_ONLY-LABEL: 'masked_scatter_v1i128' diff --git a/llvm/test/Analysis/CostModel/RISCV/masked_ldst.ll b/llvm/test/Analysis/CostModel/RISCV/masked_ldst.ll index 31bbc8b02a192..e6f53d4429c79 100644 --- a/llvm/test/Analysis/CostModel/RISCV/masked_ldst.ll +++ b/llvm/test/Analysis/CostModel/RISCV/masked_ldst.ll @@ -13,14 +13,14 @@ define void @fixed() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 287 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 223 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; entry: diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll index a55576089c7fe..06429a5107113 100644 --- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll +++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll @@ -310,7 +310,7 @@ define void @fshl(i32 %a, i32 %b, i32 %c, <16 x i32> %va, <16 x i32> %vb, <16 x define void @maskedgather(<16 x ptr> %va, <16 x i1> %vb, <16 x float> %vc) { ; THRU-LABEL: 'maskedgather' -; THRU-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %va, i32 1, <16 x i1> %vb, <16 x float> %vc) +; THRU-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %va, i32 1, <16 x i1> %vb, <16 x float> %vc) ; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; LATE-LABEL: 'maskedgather' @@ -331,7 +331,7 @@ define void @maskedgather(<16 x ptr> %va, <16 x i1> %vb, <16 x float> %vc) { define void @maskedscatter(<16 x float> %va, <16 x ptr> %vb, <16 x i1> %vc) { ; THRU-LABEL: 'maskedscatter' -; THRU-NEXT: Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> %vb, i32 1, <16 x i1> %vc) +; THRU-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> %vb, i32 1, <16 x i1> %vc) ; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; LATE-LABEL: 'maskedscatter' diff --git a/llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll index 80b350254e940..b62d3fb250915 100644 --- a/llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll +++ b/llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll @@ -49,8 +49,8 @@ define void @test() { ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB.loaded = load i32, ptr %inB, align 4 -; AVX512: LV: Found an estimated cost of 10 for VF 2 For instruction: %valB.loaded = load i32, ptr %inB, align 4 -; AVX512: LV: Found an estimated cost of 21 for VF 4 For instruction: %valB.loaded = load i32, ptr %inB, align 4 +; AVX512: LV: Found an estimated cost of 8 for VF 2 For instruction: %valB.loaded = load i32, ptr %inB, align 4 +; AVX512: LV: Found an estimated cost of 17 for VF 4 For instruction: %valB.loaded = load i32, ptr %inB, align 4 ; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %valB.loaded = load i32, ptr %inB, align 4 ; AVX512: LV: Found an estimated cost of 18 for VF 16 For instruction: %valB.loaded = load i32, ptr %inB, align 4 ; AVX512: LV: Found an estimated cost of 36 for VF 32 For instruction: %valB.loaded = load i32, ptr %inB, align 4 diff --git a/llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll index 9146ca498237d..1d3e45765e511 100644 --- a/llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll +++ b/llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll @@ -49,8 +49,8 @@ define void @test() { ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB.loaded = load i64, ptr %inB, align 8 -; AVX512: LV: Found an estimated cost of 10 for VF 2 For instruction: %valB.loaded = load i64, ptr %inB, align 8 -; AVX512: LV: Found an estimated cost of 22 for VF 4 For instruction: %valB.loaded = load i64, ptr %inB, align 8 +; AVX512: LV: Found an estimated cost of 8 for VF 2 For instruction: %valB.loaded = load i64, ptr %inB, align 8 +; AVX512: LV: Found an estimated cost of 18 for VF 4 For instruction: %valB.loaded = load i64, ptr %inB, align 8 ; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %valB.loaded = load i64, ptr %inB, align 8 ; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %valB.loaded = load i64, ptr %inB, align 8 ; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %valB.loaded = load i64, ptr %inB, align 8 diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll index b0f9f6d32a56f..25d12da306aba 100644 --- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll +++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll @@ -732,118 +732,118 @@ define i32 @masked_store(<1 x i1> %m1, <2 x i1> %m2, <3 x i1> %m3, <4 x i1> %m4, define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) { ; SSE2-LABEL: 'masked_gather' -; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; SSE42-LABEL: 'masked_gather' -; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 260 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; AVX1-LABEL: 'masked_gather' -; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 278 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 139 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 214 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; AVX2-LABEL: 'masked_gather' -; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 212 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; SKL-LABEL: 'masked_gather' ; SKL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; SKL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) @@ -851,73 +851,73 @@ define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8 ; SKL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; SKL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; SKL-NEXT: Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; SKL-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; SKL-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; SKL-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; SKL-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 212 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; KNL-LABEL: 'masked_gather' ; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 175 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 347 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 173 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 143 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 283 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 141 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; SKX-LABEL: 'masked_gather' ; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 175 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 347 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 173 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 143 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 283 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 141 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) @@ -955,192 +955,192 @@ define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8 define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) { ; SSE2-LABEL: 'masked_scatter' -; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE2-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found an estimated cost of 384 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SSE2-NEXT: Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; SSE42-LABEL: 'masked_scatter' -; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE42-NEXT: Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found an estimated cost of 260 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; SSE42-NEXT: Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SSE42-NEXT: Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found an estimated cost of 196 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) +; SSE42-NEXT: Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; AVX1-LABEL: 'masked_scatter' -; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; AVX1-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX1-NEXT: Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; AVX1-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX1-NEXT: Cost Model: Found an estimated cost of 278 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; AVX1-NEXT: Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; AVX1-NEXT: Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; AVX1-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; AVX1-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; AVX1-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; AVX1-NEXT: Cost Model: Found an estimated cost of 108 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; AVX1-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; AVX1-NEXT: Cost Model: Found an estimated cost of 214 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) +; AVX1-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; AVX2-LABEL: 'masked_scatter' -; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; AVX2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX2-NEXT: Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; AVX2-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX2-NEXT: Cost Model: Found an estimated cost of 276 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; AVX2-NEXT: Cost Model: Found an estimated cost of 138 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; AVX2-NEXT: Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; AVX2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; AVX2-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; AVX2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; AVX2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; AVX2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; AVX2-NEXT: Cost Model: Found an estimated cost of 212 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) +; AVX2-NEXT: Cost Model: Found an estimated cost of 106 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; AVX2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; SKL-LABEL: 'masked_scatter' -; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKL-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SKL-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKL-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKL-NEXT: Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SKL-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKL-NEXT: Cost Model: Found an estimated cost of 276 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; SKL-NEXT: Cost Model: Found an estimated cost of 138 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SKL-NEXT: Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SKL-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SKL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; SKL-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SKL-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SKL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SKL-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SKL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; SKL-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SKL-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SKL-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SKL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SKL-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; SKL-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SKL-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SKL-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SKL-NEXT: Cost Model: Found an estimated cost of 212 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) +; SKL-NEXT: Cost Model: Found an estimated cost of 106 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; SKL-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SKL-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) ; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; KNL-LABEL: 'masked_scatter' ; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) ; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) ; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) ; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; KNL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) ; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) ; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; KNL-NEXT: Cost Model: Found an estimated cost of 175 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; KNL-NEXT: Cost Model: Found an estimated cost of 87 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; KNL-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found an estimated cost of 347 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; KNL-NEXT: Cost Model: Found an estimated cost of 173 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; KNL-NEXT: Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; KNL-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; KNL-NEXT: Cost Model: Found an estimated cost of 143 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; KNL-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; KNL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found an estimated cost of 283 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) +; KNL-NEXT: Cost Model: Found an estimated cost of 141 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; KNL-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; KNL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) ; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; SKX-LABEL: 'masked_scatter' ; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) ; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; SKX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) ; SKX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) ; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) ; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SKX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) ; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) ; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; SKX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) ; SKX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) ; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) ; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKX-NEXT: Cost Model: Found an estimated cost of 175 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SKX-NEXT: Cost Model: Found an estimated cost of 87 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKX-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKX-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found an estimated cost of 347 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; SKX-NEXT: Cost Model: Found an estimated cost of 173 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SKX-NEXT: Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKX-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SKX-NEXT: Cost Model: Found an estimated cost of 143 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; SKX-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SKX-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SKX-NEXT: Cost Model: Found an estimated cost of 283 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) +; SKX-NEXT: Cost Model: Found an estimated cost of 141 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; SKX-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SKX-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) ; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) @@ -1788,19 +1788,19 @@ define <2 x i32> @test8(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) { define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x double> %src0) { ; SSE2-LABEL: 'test_gather_2f64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res ; ; SSE42-LABEL: 'test_gather_2f64' -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res ; ; AVX1-LABEL: 'test_gather_2f64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res ; ; AVX2-LABEL: 'test_gather_2f64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res ; ; SKL-LABEL: 'test_gather_2f64' @@ -1808,7 +1808,7 @@ define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x doub ; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res ; ; AVX512-LABEL: 'test_gather_2f64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res ; %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) @@ -1817,19 +1817,19 @@ define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x doub define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %src0) { ; SSE2-LABEL: 'test_gather_4i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res ; ; SSE42-LABEL: 'test_gather_4i32' -; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res ; ; AVX1-LABEL: 'test_gather_4i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res ; ; AVX2-LABEL: 'test_gather_4i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res ; ; SKL-LABEL: 'test_gather_4i32' @@ -1837,7 +1837,7 @@ define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %s ; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res ; ; KNL-LABEL: 'test_gather_4i32' -; KNL-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; KNL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) ; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res ; ; SKX-LABEL: 'test_gather_4i32' @@ -1929,25 +1929,25 @@ define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16 ; SSE2-LABEL: 'test_gather_16f32_var_mask' ; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; SSE42-LABEL: 'test_gather_16f32_var_mask' ; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; AVX1-LABEL: 'test_gather_16f32_var_mask' ; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; AVX2-LABEL: 'test_gather_16f32_var_mask' ; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; SKL-LABEL: 'test_gather_16f32_var_mask' @@ -1973,25 +1973,25 @@ define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32> ; SSE2-LABEL: 'test_gather_16f32_ra_var_mask' ; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind -; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; SSE42-LABEL: 'test_gather_16f32_ra_var_mask' ; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind -; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; AVX1-LABEL: 'test_gather_16f32_ra_var_mask' ; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind -; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; AVX2-LABEL: 'test_gather_16f32_ra_var_mask' ; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind -; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; SKL-LABEL: 'test_gather_16f32_ra_var_mask' @@ -2078,7 +2078,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> -; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'test_scatter_16i32' @@ -2086,7 +2086,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> -; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'test_scatter_16i32' @@ -2094,7 +2094,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX1-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'test_scatter_16i32' @@ -2102,7 +2102,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX2-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SKL-LABEL: 'test_scatter_16i32' @@ -2110,7 +2110,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 ; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer ; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind ; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> -; SKL-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SKL-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) ; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'test_scatter_16i32' @@ -2132,15 +2132,15 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 define void @test_scatter_8i32(<8 x i32>%a1, <8 x ptr> %ptr, <8 x i1>%mask) { ; SSE2-LABEL: 'test_scatter_8i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'test_scatter_8i32' -; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX-LABEL: 'test_scatter_8i32' -; AVX-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'test_scatter_8i32' @@ -2153,19 +2153,19 @@ define void @test_scatter_8i32(<8 x i32>%a1, <8 x ptr> %ptr, <8 x i1>%mask) { define void @test_scatter_4i32(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) { ; SSE2-LABEL: 'test_scatter_4i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'test_scatter_4i32' -; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX-LABEL: 'test_scatter_4i32' -; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; KNL-LABEL: 'test_scatter_4i32' -; KNL-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; KNL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) ; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SKX-LABEL: 'test_scatter_4i32' @@ -2180,25 +2180,25 @@ define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) { ; SSE2-LABEL: 'test_gather_4f32' ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res ; ; SSE42-LABEL: 'test_gather_4f32' ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res ; ; AVX1-LABEL: 'test_gather_4f32' ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res ; ; AVX2-LABEL: 'test_gather_4f32' ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res ; ; SKL-LABEL: 'test_gather_4f32' @@ -2210,7 +2210,7 @@ define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) { ; KNL-LABEL: 'test_gather_4f32' ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res ; ; SKX-LABEL: 'test_gather_4f32' diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll index 46123e9f60574..332d90ac4191c 100644 --- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll +++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll @@ -732,118 +732,118 @@ define i32 @masked_store(<1 x i1> %m1, <2 x i1> %m2, <3 x i1> %m3, <4 x i1> %m4, define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) { ; SSE2-LABEL: 'masked_gather' -; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; SSE42-LABEL: 'masked_gather' -; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 260 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; AVX1-LABEL: 'masked_gather' -; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 278 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 139 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 214 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; AVX2-LABEL: 'masked_gather' -; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 212 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; SKL-LABEL: 'masked_gather' ; SKL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; SKL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) @@ -851,73 +851,73 @@ define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8 ; SKL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; SKL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; SKL-NEXT: Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; SKL-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; SKL-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; SKL-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; SKL-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 212 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) +; SKL-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) ; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; KNL-LABEL: 'masked_gather' ; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 175 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 347 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 173 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; KNL-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 143 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 283 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 141 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; SKX-LABEL: 'masked_gather' ; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 175 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 347 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 173 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; SKX-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 143 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 283 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 141 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) +; SKX-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) ; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) @@ -955,192 +955,192 @@ define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8 define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) { ; SSE2-LABEL: 'masked_scatter' -; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE2-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found an estimated cost of 384 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SSE2-NEXT: Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; SSE42-LABEL: 'masked_scatter' -; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE42-NEXT: Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found an estimated cost of 260 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; SSE42-NEXT: Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SSE42-NEXT: Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found an estimated cost of 196 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) +; SSE42-NEXT: Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; AVX1-LABEL: 'masked_scatter' -; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; AVX1-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX1-NEXT: Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; AVX1-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX1-NEXT: Cost Model: Found an estimated cost of 278 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; AVX1-NEXT: Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; AVX1-NEXT: Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; AVX1-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; AVX1-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; AVX1-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; AVX1-NEXT: Cost Model: Found an estimated cost of 108 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; AVX1-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; AVX1-NEXT: Cost Model: Found an estimated cost of 214 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) +; AVX1-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; AVX2-LABEL: 'masked_scatter' -; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; AVX2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX2-NEXT: Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; AVX2-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX2-NEXT: Cost Model: Found an estimated cost of 276 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; AVX2-NEXT: Cost Model: Found an estimated cost of 138 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; AVX2-NEXT: Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; AVX2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; AVX2-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; AVX2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; AVX2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; AVX2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; AVX2-NEXT: Cost Model: Found an estimated cost of 212 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) +; AVX2-NEXT: Cost Model: Found an estimated cost of 106 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; AVX2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; SKL-LABEL: 'masked_scatter' -; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKL-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SKL-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKL-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKL-NEXT: Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SKL-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKL-NEXT: Cost Model: Found an estimated cost of 276 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; SKL-NEXT: Cost Model: Found an estimated cost of 138 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SKL-NEXT: Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SKL-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SKL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; SKL-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SKL-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SKL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SKL-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SKL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; SKL-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SKL-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SKL-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SKL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SKL-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; SKL-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SKL-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SKL-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SKL-NEXT: Cost Model: Found an estimated cost of 212 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) +; SKL-NEXT: Cost Model: Found an estimated cost of 106 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; SKL-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SKL-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) ; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; KNL-LABEL: 'masked_scatter' ; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) ; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) ; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) ; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; KNL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) ; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) ; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; KNL-NEXT: Cost Model: Found an estimated cost of 175 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; KNL-NEXT: Cost Model: Found an estimated cost of 87 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; KNL-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found an estimated cost of 347 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; KNL-NEXT: Cost Model: Found an estimated cost of 173 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; KNL-NEXT: Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; KNL-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; KNL-NEXT: Cost Model: Found an estimated cost of 143 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; KNL-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; KNL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found an estimated cost of 283 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) +; KNL-NEXT: Cost Model: Found an estimated cost of 141 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; KNL-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; KNL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) ; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; ; SKX-LABEL: 'masked_scatter' ; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) ; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; SKX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) ; SKX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) ; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) ; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SKX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) ; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) ; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) +; SKX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) ; SKX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) ; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) ; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKX-NEXT: Cost Model: Found an estimated cost of 175 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SKX-NEXT: Cost Model: Found an estimated cost of 87 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKX-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKX-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found an estimated cost of 347 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; SKX-NEXT: Cost Model: Found an estimated cost of 173 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SKX-NEXT: Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKX-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) +; SKX-NEXT: Cost Model: Found an estimated cost of 143 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; SKX-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SKX-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) +; SKX-NEXT: Cost Model: Found an estimated cost of 283 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) +; SKX-NEXT: Cost Model: Found an estimated cost of 141 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) +; SKX-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) +; SKX-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) ; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) @@ -1788,19 +1788,19 @@ define <2 x i32> @test8(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) { define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x double> %src0) { ; SSE2-LABEL: 'test_gather_2f64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res ; ; SSE42-LABEL: 'test_gather_2f64' -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res ; ; AVX1-LABEL: 'test_gather_2f64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res ; ; AVX2-LABEL: 'test_gather_2f64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res ; ; SKL-LABEL: 'test_gather_2f64' @@ -1808,7 +1808,7 @@ define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x doub ; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res ; ; AVX512-LABEL: 'test_gather_2f64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %res ; %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) @@ -1817,19 +1817,19 @@ define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x doub define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %src0) { ; SSE2-LABEL: 'test_gather_4i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res ; ; SSE42-LABEL: 'test_gather_4i32' -; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res ; ; AVX1-LABEL: 'test_gather_4i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res ; ; AVX2-LABEL: 'test_gather_4i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res ; ; SKL-LABEL: 'test_gather_4i32' @@ -1837,7 +1837,7 @@ define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %s ; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res ; ; KNL-LABEL: 'test_gather_4i32' -; KNL-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; KNL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) ; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res ; ; SKX-LABEL: 'test_gather_4i32' @@ -1929,25 +1929,25 @@ define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16 ; SSE2-LABEL: 'test_gather_16f32_var_mask' ; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; SSE42-LABEL: 'test_gather_16f32_var_mask' ; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; AVX1-LABEL: 'test_gather_16f32_var_mask' ; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; AVX2-LABEL: 'test_gather_16f32_var_mask' ; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; SKL-LABEL: 'test_gather_16f32_var_mask' @@ -1973,25 +1973,25 @@ define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32> ; SSE2-LABEL: 'test_gather_16f32_ra_var_mask' ; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind -; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; SSE42-LABEL: 'test_gather_16f32_ra_var_mask' ; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind -; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; AVX1-LABEL: 'test_gather_16f32_ra_var_mask' ; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind -; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; AVX2-LABEL: 'test_gather_16f32_ra_var_mask' ; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind -; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %res ; ; SKL-LABEL: 'test_gather_16f32_ra_var_mask' @@ -2078,7 +2078,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> -; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'test_scatter_16i32' @@ -2086,7 +2086,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> -; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX1-LABEL: 'test_scatter_16i32' @@ -2094,7 +2094,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> -; AVX1-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX1-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX2-LABEL: 'test_scatter_16i32' @@ -2102,7 +2102,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> -; AVX2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX2-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SKL-LABEL: 'test_scatter_16i32' @@ -2110,7 +2110,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 ; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer ; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind ; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1> -; SKL-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SKL-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) ; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'test_scatter_16i32' @@ -2132,15 +2132,15 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 define void @test_scatter_8i32(<8 x i32>%a1, <8 x ptr> %ptr, <8 x i1>%mask) { ; SSE2-LABEL: 'test_scatter_8i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'test_scatter_8i32' -; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX-LABEL: 'test_scatter_8i32' -; AVX-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'test_scatter_8i32' @@ -2153,19 +2153,19 @@ define void @test_scatter_8i32(<8 x i32>%a1, <8 x ptr> %ptr, <8 x i1>%mask) { define void @test_scatter_4i32(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) { ; SSE2-LABEL: 'test_scatter_4i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SSE42-LABEL: 'test_scatter_4i32' -; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX-LABEL: 'test_scatter_4i32' -; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; KNL-LABEL: 'test_scatter_4i32' -; KNL-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; KNL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) ; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SKX-LABEL: 'test_scatter_4i32' @@ -2180,25 +2180,25 @@ define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) { ; SSE2-LABEL: 'test_gather_4f32' ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res ; ; SSE42-LABEL: 'test_gather_4f32' ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res ; ; AVX1-LABEL: 'test_gather_4f32' ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res ; ; AVX2-LABEL: 'test_gather_4f32' ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res ; ; SKL-LABEL: 'test_gather_4f32' @@ -2210,7 +2210,7 @@ define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) { ; KNL-LABEL: 'test_gather_4f32' ; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; KNL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) ; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %res ; ; SKX-LABEL: 'test_gather_4f32' diff --git a/llvm/test/CodeGen/X86/cmp.ll b/llvm/test/CodeGen/X86/cmp.ll index 402da547613c5..5a63d36a6be4e 100644 --- a/llvm/test/CodeGen/X86/cmp.ll +++ b/llvm/test/CodeGen/X86/cmp.ll @@ -787,25 +787,15 @@ define i1 @shifted_mask64_testl(i64 %a) { } define i1 @shifted_mask64_extra_use_const(i64 %a) { -; NO-NDD-LABEL: shifted_mask64_extra_use_const: -; NO-NDD: # %bb.0: -; NO-NDD-NEXT: movabsq $287104476244869120, %rcx # encoding: [0x48,0xb9,0x00,0x00,0x00,0x00,0x00,0x00,0xfc,0x03] -; NO-NDD-NEXT: # imm = 0x3FC000000000000 -; NO-NDD-NEXT: testq %rcx, %rdi # encoding: [0x48,0x85,0xcf] -; NO-NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0] -; NO-NDD-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] -; NO-NDD-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte -; NO-NDD-NEXT: retq # encoding: [0xc3] -; -; NDD-LABEL: shifted_mask64_extra_use_const: -; NDD: # %bb.0: -; NDD-NEXT: movabsq $287104476244869120, %rcx # encoding: [0x48,0xb9,0x00,0x00,0x00,0x00,0x00,0x00,0xfc,0x03] -; NDD-NEXT: # imm = 0x3FC000000000000 -; NDD-NEXT: andq %rcx, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x21,0xcf] -; NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0] -; NDD-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] -; NDD-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte -; NDD-NEXT: retq # encoding: [0xc3] +; CHECK-LABEL: shifted_mask64_extra_use_const: +; CHECK: # %bb.0: +; CHECK-NEXT: movabsq $287104476244869120, %rcx # encoding: [0x48,0xb9,0x00,0x00,0x00,0x00,0x00,0x00,0xfc,0x03] +; CHECK-NEXT: # imm = 0x3FC000000000000 +; CHECK-NEXT: testq %rcx, %rdi # encoding: [0x48,0x85,0xcf] +; CHECK-NEXT: setne %al # encoding: [0x0f,0x95,0xc0] +; CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte +; CHECK-NEXT: retq # encoding: [0xc3] %v0 = and i64 %a, 287104476244869120 ; 0xff << 50 %v1 = icmp ne i64 %v0, 0 store i64 287104476244869120, ptr @d64 @@ -954,19 +944,12 @@ declare i32 @f() ; The store makes sure the chain result of the load is used which used to ; prevent the post isel peephole from catching this. define i1 @fold_test_and_with_chain(ptr %x, ptr %y, i32 %z) { -; NO-NDD-LABEL: fold_test_and_with_chain: -; NO-NDD: # %bb.0: -; NO-NDD-NEXT: testl %edx, (%rdi) # encoding: [0x85,0x17] -; NO-NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NO-NDD-NEXT: movl %edx, (%rsi) # encoding: [0x89,0x16] -; NO-NDD-NEXT: retq # encoding: [0xc3] -; -; NDD-LABEL: fold_test_and_with_chain: -; NDD: # %bb.0: -; NDD-NEXT: andl (%rdi), %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x23,0x17] -; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NDD-NEXT: movl %edx, (%rsi) # encoding: [0x89,0x16] -; NDD-NEXT: retq # encoding: [0xc3] +; CHECK-LABEL: fold_test_and_with_chain: +; CHECK: # %bb.0: +; CHECK-NEXT: testl %edx, (%rdi) # encoding: [0x85,0x17] +; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; CHECK-NEXT: movl %edx, (%rsi) # encoding: [0x89,0x16] +; CHECK-NEXT: retq # encoding: [0xc3] %a = load i32, ptr %x %b = and i32 %z, %a %c = icmp eq i32 %b, 0 diff --git a/llvm/test/MC/X86/directive-avoid_end_align.s b/llvm/test/MC/X86/directive-avoid_end_align.s new file mode 100644 index 0000000000000..1d748401edc12 --- /dev/null +++ b/llvm/test/MC/X86/directive-avoid_end_align.s @@ -0,0 +1,208 @@ +# RUN: llvm-mc -triple=x86_64 -filetype=obj %s | llvm-objdump --no-show-raw-insn -d - | FileCheck %s +# RUN: not llvm-mc -triple=x86_64 --defsym ERR=1 %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR + +# avoid_end_align has no effect since test doesn't end at alignment boundary: +.avoid_end_align 64 +# CHECK-NOT: nop + testl %eax, %eax +# CHECK: testl %eax, %eax + je .LBB0 + +.fill 58, 1, 0x00 +# NeverAlign followed by MCDataFragment: +# avoid_end_align inserts nop because `test` would end at alignment boundary: +.avoid_end_align 64 +# CHECK: 3e: nop + testl %eax, %eax +# CHECK-NEXT: 3f: testl %eax, %eax + je .LBB0 +# CHECK-NEXT: 41: je +.LBB0: + retq + +.p2align 6 +.L0: +.nops 57 + int3 +# NeverAlign followed by RelaxableFragment: +.avoid_end_align 64 +# CHECK: ba: nop + cmpl $(.L1-.L0), %eax +# CHECK-NEXT: bb: cmpl + je .L0 +# CHECK-NEXT: c1: je +.nops 65 +.L1: + +############################################################################### +# Experiment A: +# Check that NeverAlign doesn't introduce infinite loops in layout. +# Control: +# 1. NeverAlign fragment is not added, +# 2. Short formats of cmp and jcc are used (3 and 2 bytes respectively), +# 3. cmp and jcc are placed such that to be split by 64B alignment boundary. +# 4. jcc would be relaxed to a longer format if at least one byte is added +# between .L10 and je itself, e.g. by adding a NeverAlign padding byte, +# or relaxing cmp instruction. +# 5. cmp would be relaxed to a longer format if at least one byte is added +# between .L11 and .L12, e.g. due to relaxing jcc instruction. +.p2align 6 +# CHECK: 140: int3 +.fill 2, 1, 0xcc +.L10: +.nops 122 + int3 +# CHECK: 1bc: int3 +# no avoid_end_align here +# CHECK-NOT: nop + cmp $(.L12-.L11), %eax +# CHECK: 1bd: cmpl +.L11: + je .L10 +# CHECK-NEXT: 1c0: je +.nops 125 +.L12: + +# Experiment: +# Same setup as control, except NeverAlign fragment is added before cmp. +# Expected effect: +# 1. NeverAlign pads cmp+jcc by one byte since cmp and jcc are split by a 64B +# alignment boundary, +# 2. This extra byte forces jcc relaxation to a longer format (Control rule #4), +# 3. This results in an cmp relaxation (Control rule #5), +# 4. Which in turn makes NeverAlign fragment unnecessary as cmp and jcc +# are no longer split by an alignment boundary (cmp crosses the boundary). +# 5. NeverAlign padding is removed. +# 6. cmp and jcc instruction remain in relaxed form. +# 7. Relaxation converges, layout succeeds. +.p2align 6 +# CHECK: 240: int3 +.fill 2, 1, 0xcc +.L20: +.nops 122 + int3 +# CHECK: 2bc: int3 +.avoid_end_align 64 +# CHECK-NOT: nop + cmp $(.L22-.L21), %eax +# CHECK-NEXT: 2bd: cmpl +.L21: + je .L20 +# CHECK-NEXT: 2c3: je +.nops 125 +.L22: + +############################################################################### +# Experiment B: similar to exp A, but we check that once NeverAlign padding is +# removed from the layout (exp A, experiment step 5), the increased distance +# between the symbols L33 and L34 triggers the relaxation of instruction at +# label L32. +# +# Control 1: using a one-byte instruction at L33 (site of NeverAlign) leads to +# steps 2-3 of exp A, experiment: +# 2. This extra byte forces jcc relaxation to a longer format (Control rule #4), +# 3. This results in an cmp relaxation (Control rule #5), +# => short cmp under L32 +.p2align 6 +# CHECK: 380: int3 +.fill 2, 1, 0xcc +.L30: +.nops 122 + int3 +# CHECK: 3fc: int3 + hlt +#.avoid_end_align 64 +.L33: + cmp $(.L32-.L31), %eax +# CHECK: 3fe: cmpl +.L31: + je .L30 +# CHECK-NEXT: 404: je +.nops 114 +.p2align 1 + int3 + int3 +# CHECK: 47c: int3 +.L34: +.nops 9 +.L32: + cmp $(.L33-.L34), %eax +# CHECK: 487: cmp +# note that the size of cmp is 48a-487 == 3 bytes (distance is exactly -128) + int3 +# CHECK-NEXT: 48a: int3 + +# Control 2: leaving out a byte at L43 (site of NeverAlign), plus +# relaxed jcc and cmp leads to a relaxed cmp under L42 (-129 as cmp's immediate) +.p2align 6 +# CHECK: 4c0: int3 +.fill 2, 1, 0xcc +.L40: +.nops 122 + int3 +# CHECK: 53c: int3 +# int3 +#.avoid_end_align 64 +.L43: + cmp $(.L42-.L41+0x100), %eax +# CHECK: 53d: cmpl +.L41: + je .L40+0x100 +# CHECK-NEXT: 543: je +.nops 114 +.p2align 1 + int3 + int3 +# CHECK: 5bc: int3 +.L44: +.nops 9 +.L42: + cmp $(.L43-.L44), %eax +# CHECK: 5c7: cmp +# note that the size of cmp is 5cd-5c7 == 6 bytes (distance is exactly -129) + int3 +# CHECK-NEXT: 5cd: int3 + +# Experiment +# Checking if removing NeverAlign padding at L53 as a result of alignment and +# relaxation of cmp and jcc following it (see exp A), thus reproducing the case +# in Control 2 (getting a relaxed cmp under L52), is handled correctly. +.p2align 6 +# CHECK: 600: int3 +.fill 2, 1, 0xcc +.L50: +.nops 122 + int3 +# CHECK: 67c: int3 +.avoid_end_align 64 +.L53: +# CHECK-NOT: nop + cmp $(.L52-.L51), %eax +# CHECK-NEXT: 67d: cmpl +.L51: + je .L50 +# CHECK-NEXT: 683: je +.nops 114 +.p2align 1 + int3 + int3 +# CHECK: 6fc: int3 +.L54: +.nops 9 +.L52: + cmp $(.L53-.L54), %eax +# CHECK: 707: cmp +# note that the size of cmp is 70d-707 == 6 bytes (distance is exactly -129) + int3 +# CHECK-NEXT: 70d: int3 + +.ifdef ERR +# ERR: {{.*}}.s:[[#@LINE+1]]:17: error: unknown token in expression +.avoid_end_align +# ERR: {{.*}}.s:[[#@LINE+1]]:18: error: expected absolute expression +.avoid_end_align x +# ERR: {{.*}}.s:[[#@LINE+1]]:18: error: expected a positive alignment +.avoid_end_align 0 +# ERR: {{.*}}.s:[[#@LINE+1]]:20: error: unexpected token in directive +.avoid_end_align 64, 0 +.endif diff --git a/llvm/test/Transforms/JumpThreading/guard-split-debuginfo.ll b/llvm/test/Transforms/JumpThreading/guard-split-debuginfo.ll index 05ff74939449c..38fbe4de51ad2 100644 --- a/llvm/test/Transforms/JumpThreading/guard-split-debuginfo.ll +++ b/llvm/test/Transforms/JumpThreading/guard-split-debuginfo.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; RUN: opt -S -passes=jump-threading %s -o - -S | FileCheck %s ; RUN: opt -S -passes=jump-threading %s -o - -S --try-experimental-debuginfo-iterators | FileCheck %s @@ -7,6 +7,9 @@ ; parent blocks. And that ino jump-threading, the old dbg.value gets ; deleted. +; Test that JumpThreading's threadGuard() propagates the debug location +; to the `phi` from the instruction it replaces (`%retval`) + declare void @llvm.experimental.guard(i1, ...) declare i32 @f1() @@ -20,20 +23,20 @@ define i32 @branch_implies_guard(i32 %a) !dbg !7 { ; CHECK-NEXT: br i1 [[COND]], label [[T1_SPLIT:%.*]], label [[F1_SPLIT:%.*]], !dbg [[DBG12:![0-9]+]] ; CHECK: T1.split: ; CHECK-NEXT: [[V1:%.*]] = call i32 @f1(), !dbg [[DBG12]] -; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 0, metadata [[META13:![0-9]+]], metadata !DIExpression()), !dbg [[DBG14:![0-9]+]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 0, metadata [[META13:![0-9]+]], metadata !DIExpression()), !dbg [[DBG14:![0-9]+]] ; CHECK-NEXT: [[RETVAL3:%.*]] = add i32 [[V1]], 10, !dbg [[DBG12]] ; CHECK-NEXT: [[CONDGUARD4:%.*]] = icmp slt i32 [[A]], 20, !dbg [[DBG12]] ; CHECK-NEXT: br label [[MERGE:%.*]] ; CHECK: F1.split: ; CHECK-NEXT: [[V2:%.*]] = call i32 @f2(), !dbg [[DBG12]] -; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 0, metadata [[META13]], metadata !DIExpression()), !dbg [[DBG14]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 0, metadata [[META13]], metadata !DIExpression()), !dbg [[DBG14]] ; CHECK-NEXT: [[RETVAL1:%.*]] = add i32 [[V2]], 10, !dbg [[DBG12]] ; CHECK-NEXT: [[CONDGUARD2:%.*]] = icmp slt i32 [[A]], 20, !dbg [[DBG12]] ; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[CONDGUARD2]]) [ "deopt"() ] ; CHECK-NEXT: br label [[MERGE]] ; CHECK: Merge: ; CHECK-NEXT: [[RETPHI:%.*]] = phi i32 [ [[V1]], [[T1_SPLIT]] ], [ [[V2]], [[F1_SPLIT]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[RETVAL3]], [[T1_SPLIT]] ], [ [[RETVAL1]], [[F1_SPLIT]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[RETVAL3]], [[T1_SPLIT]] ], [ [[RETVAL1]], [[F1_SPLIT]] ], !dbg [[DBG12]] ; CHECK-NEXT: ret i32 [[TMP1]], !dbg [[DBG12]] ; %cond = icmp slt i32 %a, 10 @@ -78,3 +81,22 @@ Merge: !19 = distinct !DILexicalBlock(scope: !7, file: !1, line: 8, column: 7) !26 = !DILocation(line: 13, column: 3, scope: !7) +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +;. +; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C99, file: [[META1:![0-9]+]], isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META2:![0-9]+]], splitDebugInlining: false, nameTableKind: None) +; CHECK: [[META1]] = !DIFile(filename: "test.c", directory: {{.*}}) +; CHECK: [[META2]] = !{} +; CHECK: [[META3:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} +; CHECK: [[META4:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} +; CHECK: [[META5:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; CHECK: [[META6:![0-9]+]] = !{!""} +; CHECK: [[META7:![0-9]+]] = distinct !DISubprogram(name: "foo", scope: [[META1]], file: [[META1]], line: 3, type: [[META8:![0-9]+]], scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]]) +; CHECK: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) +; CHECK: [[META9]] = !{[[META10:![0-9]+]], [[META11:![0-9]+]], [[META11]]} +; CHECK: [[META10]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; CHECK: [[META11]] = !DIBasicType(name: "long int", size: 64, encoding: DW_ATE_signed) +; CHECK: [[DBG12]] = !DILocation(line: 13, column: 3, scope: [[META7]]) +; CHECK: [[META13]] = !DILocalVariable(name: "bar", arg: 1, scope: [[META7]], file: [[META1]], line: 3, type: [[META11]]) +; CHECK: [[DBG14]] = !DILocation(line: 0, scope: [[META7]]) +;. diff --git a/llvm/test/Transforms/JumpThreading/preserving-debugloc-bitcast.ll b/llvm/test/Transforms/JumpThreading/preserving-debugloc-bitcast.ll new file mode 100644 index 0000000000000..319350e06f47c --- /dev/null +++ b/llvm/test/Transforms/JumpThreading/preserving-debugloc-bitcast.ll @@ -0,0 +1,46 @@ +; RUN: opt < %s -S -passes=jump-threading | FileCheck %s + +; Test that JumpThreading's `simplifyPartiallyRedundantLoad` propagates +; the debug location to the `bitcast` from the LoadInst it replaces (`%b`). + +declare void @f1(...) + +define void @test8(ptr %0, ptr %1, ptr %2) !dbg !5 { +; CHECK: @test8 +; CHECK: [[TMP4:%.*]] = bitcast float [[A:%.*]] to i32, !dbg [[DBG9:![0-9]+]] +; CHECK: [[DBG9]] = !DILocation(line: 2, +; + %a = load float, ptr %0, align 4, !dbg !8 + %b = load i32, ptr %0, align 4, !dbg !9 + store float %a, ptr %1, align 4, !dbg !10 + %c = icmp eq i32 %b, 8, !dbg !11 + br i1 %c, label %ret1, label %ret2, !dbg !12 + +ret1: ; preds = %3 + ret void, !dbg !13 + +ret2: ; preds = %3 + %xxx = tail call i32 (...) @f1() #0, !dbg !14 + ret void, !dbg !15 +} + +!llvm.dbg.cu = !{!0} +!llvm.debugify = !{!2, !3} +!llvm.module.flags = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!1 = !DIFile(filename: "test.ll", directory: "/") +!2 = !{i32 8} +!3 = !{i32 0} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = distinct !DISubprogram(name: "test8", linkageName: "test8", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(types: !7) +!7 = !{} +!8 = !DILocation(line: 1, column: 1, scope: !5) +!9 = !DILocation(line: 2, column: 1, scope: !5) +!10 = !DILocation(line: 3, column: 1, scope: !5) +!11 = !DILocation(line: 4, column: 1, scope: !5) +!12 = !DILocation(line: 5, column: 1, scope: !5) +!13 = !DILocation(line: 6, column: 1, scope: !5) +!14 = !DILocation(line: 7, column: 1, scope: !5) +!15 = !DILocation(line: 8, column: 1, scope: !5) diff --git a/llvm/test/Transforms/JumpThreading/preserving-debugloc-fold-select.ll b/llvm/test/Transforms/JumpThreading/preserving-debugloc-fold-select.ll new file mode 100644 index 0000000000000..8fdec7210980c --- /dev/null +++ b/llvm/test/Transforms/JumpThreading/preserving-debugloc-fold-select.ll @@ -0,0 +1,76 @@ +; RUN: opt < %s -S -passes=jump-threading | FileCheck %s + +; Test the debug location update of the newly created PHINode +; which replaces the select instruction in .exit block. + +define i32 @unfold3(i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z, i32 %j) !dbg !5 { +; CHECK: .exit.thread4: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 {{.*}}, !dbg [[DBG29:![0-9]+]] +; CHECK-NEXT: ret i32 [[TMP0]], !dbg [[DBG30:![0-9]+]] +; +; CHECK: [[DBG29]] = !DILocation(line: 13, +; +entry: + %add3 = add nsw i32 %j, 2, !dbg !19 + %cmp.i = icmp slt i32 %u, %v, !dbg !20 + br i1 %cmp.i, label %.exit, label %cond.false.i, !dbg !21 + +cond.false.i: ; preds = %entry + %cmp4.i = icmp sgt i32 %u, %v, !dbg !22 + br i1 %cmp4.i, label %.exit, label %cond.false.6.i, !dbg !23 + +cond.false.6.i: ; preds = %cond.false.i + %cmp8.i = icmp slt i32 %w, %x, !dbg !24 + br i1 %cmp8.i, label %.exit, label %cond.false.10.i, !dbg !25 + +cond.false.10.i: ; preds = %cond.false.6.i + %cmp13.i = icmp sgt i32 %w, %x, !dbg !26 + br i1 %cmp13.i, label %.exit, label %cond.false.15.i, !dbg !27 + +cond.false.15.i: ; preds = %cond.false.10.i + %phitmp = icmp sge i32 %y, %z, !dbg !28 + br label %.exit, !dbg !29 + +.exit: ; preds = %cond.false.15.i, %cond.false.10.i, %cond.false.6.i, %cond.false.i, %entry + %cond23.i = phi i1 [ false, %entry ], [ true, %cond.false.i ], [ false, %cond.false.6.i ], [ %phitmp, %cond.false.15.i ], [ true, %cond.false.10.i ], !dbg !30 + %j.add3 = select i1 %cond23.i, i32 %j, i32 %add3, !dbg !31 + ret i32 %j.add3, !dbg !32 +} + +!llvm.dbg.cu = !{!0} +!llvm.debugify = !{!2, !3} +!llvm.module.flags = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!1 = !DIFile(filename: "preserving-debugloc-trytofoldselect.ll", directory: "/") +!2 = !{i32 14} +!3 = !{i32 8} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = distinct !DISubprogram(name: "unfold3", linkageName: "unfold3", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8) +!6 = !DISubroutineType(types: !7) +!7 = !{} +!8 = !{!9, !11, !13, !14, !15, !16, !17, !18} +!9 = !DILocalVariable(name: "1", scope: !5, file: !1, line: 1, type: !10) +!10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned) +!11 = !DILocalVariable(name: "2", scope: !5, file: !1, line: 2, type: !12) +!12 = !DIBasicType(name: "ty8", size: 8, encoding: DW_ATE_unsigned) +!13 = !DILocalVariable(name: "3", scope: !5, file: !1, line: 4, type: !12) +!14 = !DILocalVariable(name: "4", scope: !5, file: !1, line: 6, type: !12) +!15 = !DILocalVariable(name: "5", scope: !5, file: !1, line: 8, type: !12) +!16 = !DILocalVariable(name: "6", scope: !5, file: !1, line: 10, type: !12) +!17 = !DILocalVariable(name: "7", scope: !5, file: !1, line: 12, type: !12) +!18 = !DILocalVariable(name: "8", scope: !5, file: !1, line: 13, type: !10) +!19 = !DILocation(line: 1, column: 1, scope: !5) +!20 = !DILocation(line: 2, column: 1, scope: !5) +!21 = !DILocation(line: 3, column: 1, scope: !5) +!22 = !DILocation(line: 4, column: 1, scope: !5) +!23 = !DILocation(line: 5, column: 1, scope: !5) +!24 = !DILocation(line: 6, column: 1, scope: !5) +!25 = !DILocation(line: 7, column: 1, scope: !5) +!26 = !DILocation(line: 8, column: 1, scope: !5) +!27 = !DILocation(line: 9, column: 1, scope: !5) +!28 = !DILocation(line: 10, column: 1, scope: !5) +!29 = !DILocation(line: 11, column: 1, scope: !5) +!30 = !DILocation(line: 12, column: 1, scope: !5) +!31 = !DILocation(line: 13, column: 1, scope: !5) +!32 = !DILocation(line: 14, column: 1, scope: !5) diff --git a/llvm/test/Transforms/LICM/debugloc-preserve-fmul-drop-fdiv.ll b/llvm/test/Transforms/LICM/debugloc-preserve-fmul-drop-fdiv.ll new file mode 100644 index 0000000000000..e6e6f077fd6d2 --- /dev/null +++ b/llvm/test/Transforms/LICM/debugloc-preserve-fmul-drop-fdiv.ll @@ -0,0 +1,66 @@ +; RUN: opt -passes=licm -verify-memoryssa -S < %s | FileCheck %s + +; JumpThreading's hoistRegion() replaces the `fdiv` (%v6), of which the second +; operand (%v) is a loop invariant, with a loop invariant `fdiv` and a `fmul`. +; This test checks that the debug location propagates to the new `fmul` from +; the old `fdiv` it replaces in block `loop` and the debug location drop of new +; `fdiv`, which is hoisted to block `entry` after being created. + +define zeroext i1 @invariant_denom(double %v) !dbg !5 { +; CHECK: entry: +; CHECK-NEXT: [[TMP0:%.*]] = fdiv fast double 1.000000e+00, [[V:%.*]]{{$}} +; CHECK: loop: +; CHECK: [[TMP1:%.*]] = fmul fast double {{.*}}, !dbg [[DBG12:![0-9]+]] +; CHECK: [[DBG12]] = !DILocation(line: 5, +; +entry: + br label %loop, !dbg !8 + +loop: ; preds = %loop, %entry + %v3 = phi i32 [ 0, %entry ], [ %v11, %loop ], !dbg !9 + %v4 = phi i32 [ 0, %entry ], [ %v12, %loop ], !dbg !10 + %v5 = uitofp i32 %v4 to double, !dbg !11 + %v6 = fdiv fast double %v5, %v, !dbg !12 + %v7 = fptoui double %v6 to i64, !dbg !13 + %v8 = and i64 %v7, 1, !dbg !14 + %v9 = xor i64 %v8, 1, !dbg !15 + %v10 = trunc i64 %v9 to i32, !dbg !16 + %v11 = add i32 %v10, %v3, !dbg !17 + %v12 = add nuw i32 %v4, 1, !dbg !18 + %v13 = icmp eq i32 %v12, -1, !dbg !19 + br i1 %v13, label %end, label %loop, !dbg !20 + +end: ; preds = %loop + %v15 = phi i32 [ %v11, %loop ], !dbg !21 + %v16 = icmp ne i32 %v15, 0, !dbg !22 + ret i1 %v16, !dbg !23 +} + +!llvm.dbg.cu = !{!0} +!llvm.debugify = !{!2, !3} +!llvm.module.flags = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!1 = !DIFile(filename: "test.ll", directory: "/") +!2 = !{i32 16} +!3 = !{i32 0} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = distinct !DISubprogram(name: "invariant_denom", linkageName: "invariant_denom", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(types: !7) +!7 = !{} +!8 = !DILocation(line: 1, column: 1, scope: !5) +!9 = !DILocation(line: 2, column: 1, scope: !5) +!10 = !DILocation(line: 3, column: 1, scope: !5) +!11 = !DILocation(line: 4, column: 1, scope: !5) +!12 = !DILocation(line: 5, column: 1, scope: !5) +!13 = !DILocation(line: 6, column: 1, scope: !5) +!14 = !DILocation(line: 7, column: 1, scope: !5) +!15 = !DILocation(line: 8, column: 1, scope: !5) +!16 = !DILocation(line: 9, column: 1, scope: !5) +!17 = !DILocation(line: 10, column: 1, scope: !5) +!18 = !DILocation(line: 11, column: 1, scope: !5) +!19 = !DILocation(line: 12, column: 1, scope: !5) +!20 = !DILocation(line: 13, column: 1, scope: !5) +!21 = !DILocation(line: 14, column: 1, scope: !5) +!22 = !DILocation(line: 15, column: 1, scope: !5) +!23 = !DILocation(line: 16, column: 1, scope: !5) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll index 9969f881063c3..6b4cfa091c45e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll @@ -5,8 +5,8 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-COST: Checking a loop in 'fixed_width' -; CHECK-COST: Found an estimated cost of 14 for VF 2 For instruction: store i32 2, ptr %arrayidx1, align 4 -; CHECK-COST: Found an estimated cost of 28 for VF 4 For instruction: store i32 2, ptr %arrayidx1, align 4 +; CHECK-COST: Found an estimated cost of 10 for VF 2 For instruction: store i32 2, ptr %arrayidx1, align 4 +; CHECK-COST: Found an estimated cost of 20 for VF 4 For instruction: store i32 2, ptr %arrayidx1, align 4 ; CHECK-COST: Selecting VF: 1. ; We should decide this loop is not worth vectorising using fixed width vectors diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll index 0fb634fce135d..b58f92d709361 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll @@ -102,11 +102,7 @@ define <8 x half> @fneg(<8 x half> %a, <8 x half> %b) { define <8 x i8> @abs(<8 x i8> %a) { ; CHECK-LABEL: @abs( -; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> -; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> -; CHECK-NEXT: [[ABT:%.*]] = call <4 x i8> @llvm.abs.v4i8(<4 x i8> [[AT]], i1 false) -; CHECK-NEXT: [[ABB:%.*]] = call <4 x i8> @llvm.abs.v4i8(<4 x i8> [[AB]], i1 false) -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i8> [[ABT]], <4 x i8> [[ABB]], <8 x i32> +; CHECK-NEXT: [[R:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[A:%.*]], i1 false) ; CHECK-NEXT: ret <8 x i8> [[R]] ; %ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> @@ -119,11 +115,7 @@ define <8 x i8> @abs(<8 x i8> %a) { define <8 x half> @powi(<8 x half> %a) { ; CHECK-LABEL: @powi( -; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <4 x i32> -; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> -; CHECK-NEXT: [[ABT:%.*]] = call <4 x half> @llvm.powi.v4f16.i32(<4 x half> [[AT]], i32 10) -; CHECK-NEXT: [[ABB:%.*]] = call <4 x half> @llvm.powi.v4f16.i32(<4 x half> [[AB]], i32 10) -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x half> [[ABT]], <4 x half> [[ABB]], <8 x i32> +; CHECK-NEXT: [[R:%.*]] = call <8 x half> @llvm.powi.v8f16.i32(<8 x half> [[A:%.*]], i32 10) ; CHECK-NEXT: ret <8 x half> [[R]] ; %ab = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> @@ -171,11 +163,7 @@ declare <4 x half> @othercall(<4 x half>) define <8 x i32> @lrint(<8 x half> %a) { ; CHECK-LABEL: @lrint( -; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <4 x i32> -; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> -; CHECK-NEXT: [[ABT:%.*]] = call <4 x i32> @llvm.lrint.v4i32.v4f16(<4 x half> [[AT]]) -; CHECK-NEXT: [[ABB:%.*]] = call <4 x i32> @llvm.lrint.v4i32.v4f16(<4 x half> [[AB]]) -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[ABT]], <4 x i32> [[ABB]], <8 x i32> +; CHECK-NEXT: [[R:%.*]] = call <8 x i32> @llvm.lrint.v8i32.v8f16(<8 x half> [[A:%.*]]) ; CHECK-NEXT: ret <8 x i32> [[R]] ; %ab = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> @@ -427,15 +415,7 @@ define <8 x i8> @icmpsel(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { define <8 x half> @fma(<8 x half> %a, <8 x half> %b, <8 x half> %c) { ; CHECK-LABEL: @fma( -; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <4 x i32> -; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32> -; CHECK-NEXT: [[BB:%.*]] = shufflevector <8 x half> [[B:%.*]], <8 x half> poison, <4 x i32> -; CHECK-NEXT: [[BT:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <4 x i32> -; CHECK-NEXT: [[CB:%.*]] = shufflevector <8 x half> [[C:%.*]], <8 x half> poison, <4 x i32> -; CHECK-NEXT: [[CT:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <4 x i32> -; CHECK-NEXT: [[ABB:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[AB]], <4 x half> [[BB]], <4 x half> [[CB]]) -; CHECK-NEXT: [[ABT:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[AT]], <4 x half> [[BT]], <4 x half> [[CT]]) -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x half> [[ABT]], <4 x half> [[ABB]], <8 x i32> +; CHECK-NEXT: [[R:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]]) ; CHECK-NEXT: ret <8 x half> [[R]] ; %ab = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> @@ -487,19 +467,10 @@ define void @exttrunc(<8 x i32> %a, <8 x i32> %b, ptr %p) { define <8 x i8> @intrinsics_minmax(<8 x i8> %a, <8 x i8> %b) { ; CHECK-LABEL: @intrinsics_minmax( -; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> -; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> -; CHECK-NEXT: [[BB:%.*]] = shufflevector <8 x i8> [[B:%.*]], <8 x i8> poison, <4 x i32> -; CHECK-NEXT: [[BT:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> poison, <4 x i32> -; CHECK-NEXT: [[ABT:%.*]] = call <4 x i8> @llvm.smin.v4i8(<4 x i8> [[AT]], <4 x i8> [[BT]]) -; CHECK-NEXT: [[ABB:%.*]] = call <4 x i8> @llvm.smin.v4i8(<4 x i8> [[AB]], <4 x i8> [[BB]]) -; CHECK-NEXT: [[ABT1:%.*]] = call <4 x i8> @llvm.smax.v4i8(<4 x i8> [[ABT]], <4 x i8> [[BT]]) -; CHECK-NEXT: [[ABB1:%.*]] = call <4 x i8> @llvm.smax.v4i8(<4 x i8> [[ABB]], <4 x i8> [[BB]]) -; CHECK-NEXT: [[ABT2:%.*]] = call <4 x i8> @llvm.umin.v4i8(<4 x i8> [[ABT1]], <4 x i8> [[BT]]) -; CHECK-NEXT: [[ABB2:%.*]] = call <4 x i8> @llvm.umin.v4i8(<4 x i8> [[ABB1]], <4 x i8> [[BB]]) -; CHECK-NEXT: [[ABT3:%.*]] = call <4 x i8> @llvm.umax.v4i8(<4 x i8> [[ABT2]], <4 x i8> [[BT]]) -; CHECK-NEXT: [[ABB3:%.*]] = call <4 x i8> @llvm.umax.v4i8(<4 x i8> [[ABB2]], <4 x i8> [[BB]]) -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i8> [[ABT3]], <4 x i8> [[ABB3]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.smin.v8i8(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[B]]) +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[TMP2]], <8 x i8> [[B]]) +; CHECK-NEXT: [[R:%.*]] = call <8 x i8> @llvm.umax.v8i8(<8 x i8> [[TMP3]], <8 x i8> [[B]]) ; CHECK-NEXT: ret <8 x i8> [[R]] ; %ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> @@ -520,19 +491,10 @@ define <8 x i8> @intrinsics_minmax(<8 x i8> %a, <8 x i8> %b) { define <8 x i8> @intrinsics_addsat(<8 x i8> %a, <8 x i8> %b) { ; CHECK-LABEL: @intrinsics_addsat( -; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> -; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> -; CHECK-NEXT: [[BB:%.*]] = shufflevector <8 x i8> [[B:%.*]], <8 x i8> poison, <4 x i32> -; CHECK-NEXT: [[BT:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> poison, <4 x i32> -; CHECK-NEXT: [[ABT:%.*]] = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> [[AT]], <4 x i8> [[BT]]) -; CHECK-NEXT: [[ABB:%.*]] = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> [[AB]], <4 x i8> [[BB]]) -; CHECK-NEXT: [[ABT1:%.*]] = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> [[ABT]], <4 x i8> [[BT]]) -; CHECK-NEXT: [[ABB1:%.*]] = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> [[ABB]], <4 x i8> [[BB]]) -; CHECK-NEXT: [[ABT2:%.*]] = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> [[ABT1]], <4 x i8> [[BT]]) -; CHECK-NEXT: [[ABB2:%.*]] = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> [[ABB1]], <4 x i8> [[BB]]) -; CHECK-NEXT: [[ABT3:%.*]] = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> [[ABT2]], <4 x i8> [[BT]]) -; CHECK-NEXT: [[ABB3:%.*]] = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> [[ABB2]], <4 x i8> [[BB]]) -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i8> [[ABT3]], <4 x i8> [[ABB3]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[B]]) +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> [[TMP2]], <8 x i8> [[B]]) +; CHECK-NEXT: [[R:%.*]] = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> [[TMP3]], <8 x i8> [[B]]) ; CHECK-NEXT: ret <8 x i8> [[R]] ; %ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> diff --git a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp index ec4f5c74498f8..06966b1883586 100644 --- a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp +++ b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp @@ -58,8 +58,8 @@ dumpDXContainer(MemoryBufferRef Source) { assert(DXIL && "Since we are iterating and found a DXIL part, " "this should never not have a value"); NewPart.Program = DXContainerYAML::DXILProgram{ - DXIL->first.MajorVersion, - DXIL->first.MinorVersion, + DXIL->first.getMajorVersion(), + DXIL->first.getMinorVersion(), DXIL->first.ShaderKind, DXIL->first.Size, DXIL->first.Bitcode.MajorVersion, diff --git a/llvm/unittests/MC/AMDGPU/SIProgramInfoMCExprs.cpp b/llvm/unittests/MC/AMDGPU/SIProgramInfoMCExprs.cpp index 912974fa23bdb..57828a728931d 100644 --- a/llvm/unittests/MC/AMDGPU/SIProgramInfoMCExprs.cpp +++ b/llvm/unittests/MC/AMDGPU/SIProgramInfoMCExprs.cpp @@ -7,9 +7,8 @@ //===----------------------------------------------------------------------===// #include "AMDGPUHSAMetadataStreamer.h" -#include "AMDGPUTargetMachine.h" -#include "GCNSubtarget.h" #include "SIProgramInfo.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" @@ -25,9 +24,8 @@ using namespace llvm; class SIProgramInfoMCExprsTest : public testing::Test { protected: - std::unique_ptr TM; + std::unique_ptr TM; std::unique_ptr Ctx; - std::unique_ptr ST; std::unique_ptr MMI; std::unique_ptr MF; std::unique_ptr M; @@ -49,7 +47,7 @@ class SIProgramInfoMCExprsTest : public testing::Test { const Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error); TargetOptions Options; - TM.reset(static_cast(TheTarget->createTargetMachine( + TM.reset(static_cast(TheTarget->createTargetMachine( Triple, CPU, FS, Options, std::nullopt, std::nullopt))); Ctx = std::make_unique(); @@ -59,9 +57,7 @@ class SIProgramInfoMCExprsTest : public testing::Test { auto *F = Function::Create(FType, GlobalValue::ExternalLinkage, "Test", *M); MMI = std::make_unique(TM.get()); - ST = std::make_unique(TM->getTargetTriple(), - TM->getTargetCPU(), - TM->getTargetFeatureString(), *TM); + auto *ST = TM->getSubtargetImpl(*F); MF = std::make_unique(*F, *TM, *ST, 1, *MMI); MF->initTargetMachineFunctionInfo(*ST); diff --git a/llvm/unittests/Object/DXContainerTest.cpp b/llvm/unittests/Object/DXContainerTest.cpp index da640225617d4..115de47a3cefd 100644 --- a/llvm/unittests/Object/DXContainerTest.cpp +++ b/llvm/unittests/Object/DXContainerTest.cpp @@ -174,6 +174,49 @@ TEST(DXCFile, ParseEmptyParts) { } } +// This test verify DXIL part are correctly parsed. +// This test is based on the binary output constructed from this yaml. +// --- !dxcontainer +// Header: +// Hash: [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, +// 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] +// Version: +// Major: 1 +// Minor: 0 +// PartCount: 1 +// Parts: +// - Name: DXIL +// Size: 28 +// Program: +// MajorVersion: 6 +// MinorVersion: 5 +// ShaderKind: 5 +// Size: 8 +// DXILMajorVersion: 1 +// DXILMinorVersion: 5 +// DXILSize: 4 +// DXIL: [ 0x42, 0x43, 0xC0, 0xDE, ] +// ... +TEST(DXCFile, ParseDXILPart) { + uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x44, 0x58, 0x49, 0x4c, 0x1c, 0x00, 0x00, 0x00, 0x65, 0x00, 0x05, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x44, 0x58, 0x49, 0x4c, 0x05, 0x01, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x42, 0x43, 0xc0, 0xde}; + DXContainer C = + llvm::cantFail(DXContainer::create(getMemoryBuffer<116>(Buffer))); + EXPECT_EQ(C.getHeader().PartCount, 1u); + const std::optional &DXIL = C.getDXIL(); + EXPECT_TRUE(DXIL.has_value()); + dxbc::ProgramHeader Header = DXIL->first; + EXPECT_EQ(Header.getMajorVersion(), 6u); + EXPECT_EQ(Header.getMinorVersion(), 5u); + EXPECT_EQ(Header.ShaderKind, 5u); + EXPECT_EQ(Header.Size, 8u); +} + static Expected generateDXContainer(StringRef Yaml, SmallVectorImpl &BinaryData) { DXContainerYAML::Object Obj; diff --git a/mlir/include/mlir-c/Dialect/IRDL.h b/mlir/include/mlir-c/Dialect/IRDL.h new file mode 100644 index 0000000000000..c4d6ffd989af9 --- /dev/null +++ b/mlir/include/mlir-c/Dialect/IRDL.h @@ -0,0 +1,29 @@ +//===-- mlir-c/Dialect/IRDL.h - C API for IRDL --------------------*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM +// Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_C_DIALECT_IRDL_H +#define MLIR_C_DIALECT_IRDL_H + +#include "mlir-c/IR.h" + +#ifdef __cplusplus +extern "C" { +#endif + +MLIR_DECLARE_CAPI_DIALECT_REGISTRATION(IRDL, irdl); + +/// Loads all IRDL dialects in the provided module, registering the dialects in +/// the module's associated context. +MLIR_CAPI_EXPORTED MlirLogicalResult mlirLoadIRDLDialects(MlirModule module); + +#ifdef __cplusplus +} +#endif + +#endif // MLIR_C_DIALECT_IRDL_H diff --git a/mlir/include/mlir/Analysis/DataFlowFramework.h b/mlir/include/mlir/Analysis/DataFlowFramework.h index c76cfac07fc77..2580ec28b5190 100644 --- a/mlir/include/mlir/Analysis/DataFlowFramework.h +++ b/mlir/include/mlir/Analysis/DataFlowFramework.h @@ -242,6 +242,17 @@ class DataFlowSolver { return static_cast(it->second.get()); } + /// Erase any analysis state associated with the given program point. + template + void eraseState(PointT point) { + ProgramPoint pp(point); + + for (auto it = analysisStates.begin(); it != analysisStates.end(); ++it) { + if (it->first.first == pp) + analysisStates.erase(it); + } + } + /// Get a uniqued program point instance. If one is not present, it is /// created with the provided arguments. template diff --git a/mlir/lib/CAPI/Dialect/CMakeLists.txt b/mlir/lib/CAPI/Dialect/CMakeLists.txt index 58b8739043f9d..4e141b60ff8cc 100644 --- a/mlir/lib/CAPI/Dialect/CMakeLists.txt +++ b/mlir/lib/CAPI/Dialect/CMakeLists.txt @@ -72,6 +72,15 @@ add_mlir_upstream_c_api_library(MLIRCAPIGPU MLIRPass ) +add_mlir_upstream_c_api_library(MLIRCAPIIRDL + IRDL.cpp + + PARTIAL_SOURCES_INTENDED + LINK_LIBS PUBLIC + MLIRCAPIIR + MLIRIRDL +) + add_mlir_upstream_c_api_library(MLIRCAPILLVM LLVM.cpp diff --git a/mlir/lib/CAPI/Dialect/IRDL.cpp b/mlir/lib/CAPI/Dialect/IRDL.cpp new file mode 100644 index 0000000000000..cb9dc8ceb6795 --- /dev/null +++ b/mlir/lib/CAPI/Dialect/IRDL.cpp @@ -0,0 +1,18 @@ +//===- IRDL.cpp - C Interface for IRDL dialect ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir-c/Dialect/IRDL.h" +#include "mlir/CAPI/Registration.h" +#include "mlir/Dialect/IRDL/IR/IRDL.h" +#include "mlir/Dialect/IRDL/IRDLLoading.h" + +MLIR_DEFINE_CAPI_DIALECT_REGISTRATION(IRDL, irdl, mlir::irdl::IRDLDialect) + +MlirLogicalResult mlirLoadIRDLDialects(MlirModule module) { + return wrap(mlir::irdl::loadDialects(unwrap(module))); +} diff --git a/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp b/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp index cd6da35582469..89f956a5e7017 100644 --- a/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp +++ b/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp @@ -55,9 +55,8 @@ TensorType inferReshapeExpandedType(TensorType inputType, // Check if the input is static, and if so, get its total size bool inputIsStatic = inputType.hasStaticShape(); int64_t totalSize = inputIsStatic ? inputType.getNumElements() : -1; - + // Compute result shape - bool resultIsStatic = true; auto resultShape = llvm::map_to_vector(newShape, [&](int64_t size) -> int64_t { // If this is not a placeholder, do not change it if (size >= 0) @@ -65,10 +64,8 @@ TensorType inferReshapeExpandedType(TensorType inputType, // If we do not know the total size of the tensor, keep this dimension // dynamic in the result shape. - if (!inputIsStatic) { - resultIsStatic = false; + if (!inputIsStatic) return ShapedType::kDynamic; - } // Calculate the product of all elements in 'newShape' except for the -1 // placeholder, which we discard by negating the result. @@ -84,12 +81,14 @@ TensorType inferReshapeExpandedType(TensorType inputType, return totalSize / totalSizeNoPlaceholder; }); + bool resultIsStatic = !ShapedType::isDynamicShape(resultShape); + // A syntactic restriction in 'tensor.expand_shape' forbids a dynamically // shaped input from being reshaped into a statically shaped result. We may // simply turn the first result dimension dynamic to address this. if (!inputIsStatic && resultIsStatic) resultShape[0] = ShapedType::kDynamic; - + // The 'tensor.expand_shape' op also forbids a statically shaped input from // being reshaped into a dynamically shaped result, but the placeholder // inference algorithm above guarantees that this will never be the case. diff --git a/mlir/lib/Dialect/Arith/Transforms/IntRangeOptimizations.cpp b/mlir/lib/Dialect/Arith/Transforms/IntRangeOptimizations.cpp index 92cad7cd1ef26..2473169962b95 100644 --- a/mlir/lib/Dialect/Arith/Transforms/IntRangeOptimizations.cpp +++ b/mlir/lib/Dialect/Arith/Transforms/IntRangeOptimizations.cpp @@ -102,6 +102,24 @@ static FailureOr handleUge(ConstantIntRanges lhs, ConstantIntRanges rhs) { } namespace { +/// This class listens on IR transformations performed during a pass relying on +/// information from a `DataflowSolver`. It erases state associated with the +/// erased operation and its results from the `DataFlowSolver` so that Patterns +/// do not accidentally query old state information for newly created Ops. +class DataFlowListener : public RewriterBase::Listener { +public: + DataFlowListener(DataFlowSolver &s) : s(s) {} + +protected: + void notifyOperationErased(Operation *op) override { + s.eraseState(op); + for (Value res : op->getResults()) + s.eraseState(res); + } + + DataFlowSolver &s; +}; + struct ConvertCmpOp : public OpRewritePattern { ConvertCmpOp(MLIRContext *context, DataFlowSolver &s) @@ -167,10 +185,15 @@ struct IntRangeOptimizationsPass if (failed(solver.initializeAndRun(op))) return signalPassFailure(); + DataFlowListener listener(solver); + RewritePatternSet patterns(ctx); populateIntRangeOptimizationsPatterns(patterns, solver); - if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns)))) + GreedyRewriteConfig config; + config.listener = &listener; + + if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns), config))) signalPassFailure(); } }; diff --git a/mlir/test/CAPI/CMakeLists.txt b/mlir/test/CAPI/CMakeLists.txt index b9cd63ef7c673..57b342a5e26b2 100644 --- a/mlir/test/CAPI/CMakeLists.txt +++ b/mlir/test/CAPI/CMakeLists.txt @@ -38,6 +38,13 @@ _add_capi_test_executable(mlir-capi-ir-test MLIRCAPIRegisterEverything ) +_add_capi_test_executable(mlir-capi-irdl-test + irdl.c + LINK_LIBS PRIVATE + MLIRCAPIIR + MLIRCAPIIRDL +) + _add_capi_test_executable(mlir-capi-llvm-test llvm.c LINK_LIBS PRIVATE diff --git a/mlir/test/CAPI/irdl.c b/mlir/test/CAPI/irdl.c new file mode 100644 index 0000000000000..b35345b664b14 --- /dev/null +++ b/mlir/test/CAPI/irdl.c @@ -0,0 +1,58 @@ +//===- irdl.c - Test for the C bindings for IRDL registration -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM +// Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +/* RUN: mlir-capi-irdl-test 2>&1 | FileCheck %s + */ + +#include "mlir-c/Dialect/IRDL.h" +#include "mlir-c/IR.h" + +const char irdlDialect[] = "\ + irdl.dialect @foo {\ + irdl.operation @op {\ + %i32 = irdl.is i32\ + irdl.results(%i32)\ + }\ + }\ + irdl.dialect @bar {\ + irdl.operation @op {\ + %i32 = irdl.is i32\ + irdl.operands(%i32)\ + }\ + }"; + +// CHECK: module { +// CHECK-NEXT: %[[RES:.*]] = "foo.op"() : () -> i32 +// CHECK-NEXT: "bar.op"(%[[RES]]) : (i32) -> () +// CHECK-NEXT: } +const char newDialectUsage[] = "\ + module {\ + %res = \"foo.op\"() : () -> i32\ + \"bar.op\"(%res) : (i32) -> ()\ + }"; + +int main(void) { + MlirContext ctx = mlirContextCreate(); + mlirDialectHandleLoadDialect(mlirGetDialectHandle__irdl__(), ctx); + + MlirModule dialectDecl = + mlirModuleCreateParse(ctx, mlirStringRefCreateFromCString(irdlDialect)); + + mlirLoadIRDLDialects(dialectDecl); + mlirModuleDestroy(dialectDecl); + + MlirModule usingModule = mlirModuleCreateParse( + ctx, mlirStringRefCreateFromCString(newDialectUsage)); + + mlirOperationDump(mlirModuleGetOperation(usingModule)); + + mlirModuleDestroy(usingModule); + mlirContextDestroy(ctx); + return 0; +} diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt index 5319a9cb33e00..8806a1dd92238 100644 --- a/mlir/test/CMakeLists.txt +++ b/mlir/test/CMakeLists.txt @@ -101,6 +101,7 @@ configure_lit_site_cfg( set(MLIR_TEST_DEPENDS FileCheck count not split-file mlir-capi-ir-test + mlir-capi-irdl-test mlir-capi-llvm-test mlir-capi-pass-test mlir-capi-quant-test diff --git a/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir b/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir index b8c3d56f21f10..72e7e4cc84088 100644 --- a/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir +++ b/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir @@ -394,6 +394,21 @@ func.func @test_reshape_6d_down_s2s_auto(%arg0: tensor<1x2x3x5x7x11xf32>) -> ten // ----- +// This test would previously fail on GCC with certain compiler flags. +// The GCC issue would cause invalid IR after tosa-to-tensor, so this test +// locks down that the code goes through tosa-to-tensor and verifies. +// +// See https://github.com/llvm/llvm-project/pull/91521 for a full description. + +// CHECK-LABEL: reshape_bug_fix +// CHECK: tensor.expand_shape +func.func @reshape_bug_fix(%arg0: tensor) -> tensor<1x1x1x?xf32> { + %0 = tosa.reshape %arg0 {new_shape = array} : (tensor) -> tensor<1x1x1x?xf32> + return %0 : tensor<1x1x1x?xf32> +} + +// ----- + // CHECK-LABEL: test_reshape_6d_down_s2s_explicit // CHECK-SAME: %[[ARG_0:[a-zA-Z0-9_]+]]: tensor<1x2x3x5x7x11xf32> // CHECK: %[[VAL_0:.*]] = tensor.collapse_shape %[[ARG_0]] {{\[\[}}0, 1, 2], [3], [4, 5]] : tensor<1x2x3x5x7x11xf32> into tensor<6x5x77xf32> diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py index 4740e7d137e8d..ea6d9ae71b773 100644 --- a/mlir/test/lit.cfg.py +++ b/mlir/test/lit.cfg.py @@ -100,6 +100,7 @@ def add_runtime(name): "mlir-lsp-server", "mlir-capi-execution-engine-test", "mlir-capi-ir-test", + "mlir-capi-irdl-test", "mlir-capi-llvm-test", "mlir-capi-pass-test", "mlir-capi-pdl-test", From 972047a832f4fe67170f08f4f75bd3c0bb614021 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Sun, 12 May 2024 17:37:48 -0700 Subject: [PATCH 2/3] fix test Created using spr 1.3.4 --- bolt/test/X86/bb-with-two-tail-calls.s | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/bolt/test/X86/bb-with-two-tail-calls.s b/bolt/test/X86/bb-with-two-tail-calls.s index c0b01e1894a49..8ea719262d155 100644 --- a/bolt/test/X86/bb-with-two-tail-calls.s +++ b/bolt/test/X86/bb-with-two-tail-calls.s @@ -10,8 +10,9 @@ # RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib # RUN: llvm-bolt %t.exe -o %t.out --data %t.fdata --lite=0 --dyno-stats \ # RUN: --print-sctc --print-only=_start -enable-bat 2>&1 | FileCheck %s -# RUN: llvm-objdump --syms %t.out | FileCheck %s --check-prefix=CHECK-NM -# RUN: llvm-bat-dump %t.out --dump-all | FileCheck %s --check-prefix=CHECK-BAT +# RUN: llvm-objdump --syms %t.out > %t.log +# RUN: llvm-bat-dump %t.out --dump-all >> %t.log +# RUN: FileCheck %s --input-file %t.log --check-prefix=CHECK-BAT # CHECK-NOT: Assertion `BranchInfo.size() == 2 && "could only be called for blocks with 2 successors"' failed. # Two tail calls in the same basic block after SCTC: @@ -19,9 +20,9 @@ # CHECK-NEXT: {{.*}}: jmp {{.*}} # TAILCALL # Offset: 12 # Confirm that a deleted basic block is emitted at function end offset (0xe) -# CHECK-NM: 0000000000600000 g .text 000000000000000e _start -# CHECK-BAT: Function Address: 0x600000, hash: 0xf8bf620b266cdc1b -# CHECK-BAT: 0xe -> 0xc hash: 0x823623240f000c +# CHECK-BAT: [[#%x,ADDR:]] g .text [[#%x,SIZE:]] _start +# CHECK-BAT: Function Address: 0x[[#%x,ADDR]] +# CHECK-BAT: 0x[[#%x,SIZE]] # CHECK-BAT: NumBlocks: 5 .globl _start From 8cf2305cbd12d6667b586fdb8125bc1310fce0be Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Sun, 12 May 2024 17:46:54 -0700 Subject: [PATCH 3/3] updated tests Created using spr 1.3.4 --- bolt/test/X86/bolt-address-translation-yaml.test | 2 +- bolt/test/X86/bolt-address-translation.test | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bolt/test/X86/bolt-address-translation-yaml.test b/bolt/test/X86/bolt-address-translation-yaml.test index e21513b7dfe59..e28ca5c6c3488 100644 --- a/bolt/test/X86/bolt-address-translation-yaml.test +++ b/bolt/test/X86/bolt-address-translation-yaml.test @@ -40,7 +40,7 @@ RUN: | FileCheck --check-prefix CHECK-BOLT-YAML %s WRITE-BAT-CHECK: BOLT-INFO: Wrote 5 BAT maps WRITE-BAT-CHECK: BOLT-INFO: Wrote 4 function and 22 basic block hashes -WRITE-BAT-CHECK: BOLT-INFO: BAT section size (bytes): 384 +WRITE-BAT-CHECK: BOLT-INFO: BAT section size (bytes): 404 READ-BAT-CHECK-NOT: BOLT-ERROR: unable to save profile in YAML format for input file processed by BOLT READ-BAT-CHECK: BOLT-INFO: Parsed 5 BAT entries diff --git a/bolt/test/X86/bolt-address-translation.test b/bolt/test/X86/bolt-address-translation.test index e6b21c14077b4..dfdd1eea32333 100644 --- a/bolt/test/X86/bolt-address-translation.test +++ b/bolt/test/X86/bolt-address-translation.test @@ -37,7 +37,7 @@ # CHECK: BOLT: 3 out of 7 functions were overwritten. # CHECK: BOLT-INFO: Wrote 6 BAT maps # CHECK: BOLT-INFO: Wrote 3 function and 58 basic block hashes -# CHECK: BOLT-INFO: BAT section size (bytes): 928 +# CHECK: BOLT-INFO: BAT section size (bytes): 940 # # usqrt mappings (hot part). We match against any key (left side containing # the bolted binary offsets) because BOLT may change where it puts instructions