From 94ffb45c9657530a9d1530b51b5f833946d65b68 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Mon, 22 Jul 2024 09:19:15 -0700 Subject: [PATCH 01/42] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20?= =?UTF-8?q?changes=20to=20main=20this=20commit=20is=20based=20on?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.4 [skip ci] --- clang/docs/ReleaseNotes.rst | 2 - clang/include/clang/Basic/Attr.td | 2 +- clang/include/clang/Basic/AttrDocs.td | 3 +- .../include/clang/Basic/DiagnosticLexKinds.td | 5 - .../clang/Basic/DiagnosticSemaKinds.td | 2 +- clang/include/clang/Basic/IdentifierTable.h | 24 +- clang/include/clang/Basic/TokenKinds.def | 3 - clang/include/clang/Driver/Options.td | 4 + clang/include/clang/Lex/Preprocessor.h | 83 +--- clang/include/clang/Lex/Token.h | 3 - clang/include/clang/Parse/Parser.h | 2 +- clang/lib/AST/Interp/Compiler.cpp | 4 +- clang/lib/AST/Interp/Interp.h | 9 + clang/lib/AST/Interp/Opcodes.td | 2 + clang/lib/Basic/IdentifierTable.cpp | 3 +- clang/lib/CodeGen/CGExprConstant.cpp | 3 +- clang/lib/Driver/ToolChains/Clang.cpp | 4 +- clang/lib/Driver/ToolChains/Gnu.cpp | 29 ++ .../lib/Frontend/PrintPreprocessedOutput.cpp | 12 +- clang/lib/Lex/PPLexerChange.cpp | 9 +- clang/lib/Lex/Preprocessor.cpp | 444 ++++++------------ clang/lib/Lex/TokenConcatenation.cpp | 10 - clang/lib/Parse/ParseDecl.cpp | 8 +- clang/lib/Parse/Parser.cpp | 93 ++-- clang/lib/Sema/SemaDeclAttr.cpp | 4 + clang/test/CXX/cpp/cpp.module/p2.cppm | 88 ---- .../basic/basic.link/module-declaration.cpp | 61 ++- .../dcl.module/dcl.module.import/p1.cppm | 39 +- .../ptrauth-global-constant-initializers.cpp | 234 +++++++++ clang/test/Driver/fpatchable-function-entry.c | 9 +- .../Sema/patchable-function-entry-attr.cpp | 5 + clang/test/SemaCXX/builtin_vectorelements.cpp | 2 + clang/test/SemaCXX/modules.cppm | 89 ++-- clang/www/cxx_status.html | 2 +- .../sanitizer_common_interceptors.inc | 2 +- .../sanitizer_procmaps_bsd.cpp 
| 3 +- .../sanitizer_stacktrace_printer.cpp | 4 +- cross-project-tests/lit.cfg.py | 14 +- cross-project-tests/lit.site.cfg.py.in | 4 + libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 5 +- libc/benchmarks/gpu/LibcGpuBenchmark.h | 26 +- libc/config/linux/riscv/entrypoints.txt | 6 + libc/config/linux/riscv/headers.txt | 1 + libc/src/__support/FPUtil/BasicOperations.h | 13 +- libc/src/__support/FPUtil/CMakeLists.txt | 57 +-- libc/src/__support/FPUtil/generic/FMA.h | 45 +- libc/src/__support/FPUtil/generic/add_sub.h | 26 +- libc/src/__support/FPUtil/generic/div.h | 20 +- libc/src/__support/FPUtil/generic/mul.h | 20 +- libc/test/src/math/smoke/AddTest.h | 19 +- libc/test/src/math/smoke/DivTest.h | 19 +- libc/test/src/math/smoke/MulTest.h | 19 +- libc/test/src/math/smoke/SubTest.h | 19 +- .../src/sys/statvfs/linux/fstatvfs_test.cpp | 12 +- .../src/sys/statvfs/linux/statvfs_test.cpp | 10 +- lld/ELF/Arch/RISCV.cpp | 3 + lld/MachO/Writer.cpp | 35 +- lldb/test/API/lit.cfg.py | 5 + lldb/test/API/lit.site.cfg.py.in | 8 + lldb/test/Shell/helper/toolchain.py | 5 + lldb/test/Shell/lit.site.cfg.py.in | 9 + llvm/CMakeLists.txt | 4 + llvm/include/llvm/Analysis/ValueTracking.h | 26 +- llvm/include/llvm/CodeGen/TargetInstrInfo.h | 6 + .../ExecutionEngine/Orc/Shared/MemoryFlags.h | 2 +- llvm/include/llvm/Support/raw_socket_stream.h | 28 +- .../llvm/TargetParser/RISCVTargetParser.h | 8 + llvm/lib/Analysis/ValueTracking.cpp | 14 +- llvm/lib/CodeGen/BranchFolding.cpp | 9 +- llvm/lib/Support/raw_socket_stream.cpp | 132 ++++-- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 14 +- .../Hexagon/HexagonLoopIdiomRecognition.cpp | 2 +- .../LoongArch/LoongArchISelLowering.cpp | 166 +++++++ llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 27 +- llvm/lib/Target/RISCV/RISCVFeatures.td | 134 ++++-- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 11 +- llvm/lib/TargetParser/RISCVTargetParser.cpp | 16 + .../InstCombine/InstCombineSelect.cpp | 7 +- .../Scalar/StraightLineStrengthReduce.cpp | 6 +- 
.../test/Analysis/CostModel/RISCV/arith-fp.ll | 300 ++++++------ .../CostModel/RISCV/rvv-intrinsics.ll | 130 ++--- llvm/test/CodeGen/LoongArch/andn-icmp.ll | 57 +-- .../PowerPC/patchable-function-entry.ll | 58 +++ llvm/test/TableGen/riscv-target-def.td | 18 +- .../memoryssa-scan-limit.ll | 45 ++ .../InstCombine/select-binop-cmp.ll | 13 + llvm/test/Transforms/InstCombine/select.ll | 16 + .../RISCV/masked_gather_scatter.ll | 84 ++-- .../RISCV/riscv-vector-reverse.ll | 206 +++++--- .../RISCV/vpintrin-scalarization.ll | 100 ++-- .../ExecutionEngine/Orc/CMakeLists.txt | 1 + .../ExecutionEngine/Orc/MemoryFlagsTest.cpp | 55 +++ .../Support/raw_socket_stream_test.cpp | 45 +- llvm/utils/TableGen/RISCVTargetDefEmitter.cpp | 37 ++ mlir/docs/Canonicalization.md | 39 +- .../Transforms/Utils/DialectConversion.cpp | 11 +- 96 files changed, 2078 insertions(+), 1454 deletions(-) delete mode 100644 clang/test/CXX/cpp/cpp.module/p2.cppm create mode 100644 clang/test/CodeGenCXX/ptrauth-global-constant-initializers.cpp create mode 100644 llvm/test/CodeGen/PowerPC/patchable-function-entry.ll create mode 100644 llvm/unittests/ExecutionEngine/Orc/MemoryFlagsTest.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 4ab6bd9de8ea9..7ac6ed934290d 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -285,8 +285,6 @@ C++2c Feature Support - Implemented `P2963R3 Ordering of constraints involving fold expressions `_. -- Implemented `P3034R1 Module Declarations Shouldn’t be Macros `_. 
- Resolutions to C++ Defect Reports ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 1293d0ddbc117..4825979a974d2 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -902,7 +902,7 @@ def PatchableFunctionEntry : InheritableAttr, TargetSpecificAttr> { + "riscv64", "x86", "x86_64", "ppc", "ppc64"]>> { let Spellings = [GCC<"patchable_function_entry">]; let Subjects = SubjectList<[Function, ObjCMethod]>; let Args = [UnsignedArgument<"Count">, DefaultIntArgument<"Offset", 0>]; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 09cf4f80bd999..99738812c8157 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -5800,7 +5800,8 @@ takes precedence over the command line option ``-fpatchable-function-entry=N,M`` ``M`` defaults to 0 if omitted. This attribute is only supported on -aarch64/aarch64-be/loongarch32/loongarch64/riscv32/riscv64/i386/x86-64 targets. +aarch64/aarch64-be/loongarch32/loongarch64/riscv32/riscv64/i386/x86-64/ppc/ppc64 targets. +For ppc/ppc64 targets, AIX is still not supported. 
}]; } diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index 08ece01009387..12d7b8c0205ee 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -952,11 +952,6 @@ def warn_module_conflict : Warning< InGroup; // C++20 modules -def err_module_decl_cannot_be_macros : Error< - "the module name in a module%select{| partition}0 declaration cannot contain " - "an object-like macro %1">; -def err_unxepected_paren_in_module_decl : Error< - "unexpected '(' after the module name in a module%select{| partition}0 declaration">; def err_header_import_semi_in_macro : Error< "semicolon terminating header import declaration cannot be produced " "by a macro">; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index d60f32674ca3a..dd9dc912ef4ca 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -3492,7 +3492,7 @@ def err_attr_tlsmodel_arg : Error<"tls_model must be \"global-dynamic\", " def err_attr_codemodel_arg : Error<"code model '%0' is not supported on this target">; -def err_aix_attr_unsupported_tls_model : Error<"TLS model '%0' is not yet supported on AIX">; +def err_aix_attr_unsupported : Error<"%0 attribute is not yet supported on AIX">; def err_tls_var_aligned_over_maximum : Error< "alignment (%0) of thread-local variable %1 is greater than the maximum supported " diff --git a/clang/include/clang/Basic/IdentifierTable.h b/clang/include/clang/Basic/IdentifierTable.h index f40f74d0355ad..ae9ebd9f59154 100644 --- a/clang/include/clang/Basic/IdentifierTable.h +++ b/clang/include/clang/Basic/IdentifierTable.h @@ -180,10 +180,6 @@ class alignas(IdentifierInfoAlignment) IdentifierInfo { LLVM_PREFERRED_TYPE(bool) unsigned IsModulesImport : 1; - // True if this is the 'module' contextual keyword. 
- LLVM_PREFERRED_TYPE(bool) - unsigned IsModulesDecl : 1; - // True if this is a mangled OpenMP variant name. LLVM_PREFERRED_TYPE(bool) unsigned IsMangledOpenMPVariantName : 1; @@ -200,7 +196,7 @@ class alignas(IdentifierInfoAlignment) IdentifierInfo { LLVM_PREFERRED_TYPE(bool) unsigned IsFinal : 1; - // 21 bits left in a 64-bit word. + // 22 bits left in a 64-bit word. // Managed by the language front-end. void *FETokenInfo = nullptr; @@ -216,8 +212,8 @@ class alignas(IdentifierInfoAlignment) IdentifierInfo { IsCPPOperatorKeyword(false), NeedsHandleIdentifier(false), IsFromAST(false), ChangedAfterLoad(false), FEChangedAfterLoad(false), RevertedTokenID(false), OutOfDate(false), IsModulesImport(false), - IsModulesDecl(false), IsMangledOpenMPVariantName(false), - IsDeprecatedMacro(false), IsRestrictExpansion(false), IsFinal(false) {} + IsMangledOpenMPVariantName(false), IsDeprecatedMacro(false), + IsRestrictExpansion(false), IsFinal(false) {} public: IdentifierInfo(const IdentifierInfo &) = delete; @@ -524,18 +520,6 @@ class alignas(IdentifierInfoAlignment) IdentifierInfo { RecomputeNeedsHandleIdentifier(); } - /// Determine whether this is the contextual keyword \c module. - bool isModulesDeclaration() const { return IsModulesDecl; } - - /// Set whether this identifier is the contextual keyword \c module. - void setModulesDeclaration(bool I) { - IsModulesDecl = I; - if (I) - NeedsHandleIdentifier = true; - else - RecomputeNeedsHandleIdentifier(); - } - /// Determine whether this is the mangled name of an OpenMP variant. bool isMangledOpenMPVariantName() const { return IsMangledOpenMPVariantName; } @@ -756,8 +740,6 @@ class IdentifierTable { // If this is the 'import' contextual keyword, mark it as such. 
if (Name == "import") II->setModulesImport(true); - else if (Name == "module") - II->setModulesDeclaration(true); return *II; } diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def index 8db18c049b6d0..7f4912b9bcd96 100644 --- a/clang/include/clang/Basic/TokenKinds.def +++ b/clang/include/clang/Basic/TokenKinds.def @@ -1003,9 +1003,6 @@ ANNOTATION(module_include) ANNOTATION(module_begin) ANNOTATION(module_end) -// Annotations for C++, Clang and Objective-C named modules. -ANNOTATION(module_name) - // Annotation for a header_name token that has been looked up and transformed // into the name of a header unit. ANNOTATION(header_unit) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 9c6cebd77ff0a..8ccbc4b372ba3 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5553,6 +5553,10 @@ def pg : Flag<["-"], "pg">, HelpText<"Enable mcount instrumentation">, MarshallingInfoFlag>; def pipe : Flag<["-", "--"], "pipe">, HelpText<"Use pipes between commands, when possible">; +// Facebook T92898286 +def post_link_optimize : Flag<["--"], "post-link-optimize">, + HelpText<"Apply post-link optimizations using BOLT">; +// End Facebook T92898286 def prebind__all__twolevel__modules : Flag<["-"], "prebind_all_twolevel_modules">; def prebind : Flag<["-"], "prebind">; def preload : Flag<["-"], "preload">; diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index 56aef99a3f38a..fc7d0053f2323 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -615,6 +615,10 @@ class Preprocessor { ModuleDeclSeq ModuleDeclState; + /// Whether the module import expects an identifier next. Otherwise, + /// it expects a '.' or ';'. + bool ModuleImportExpectsIdentifier = false; + /// The identifier and source location of the currently-active /// \#pragma clang arc_cf_code_audited begin. 
std::pair PragmaARCCFCodeAuditedInfo; @@ -1740,14 +1744,11 @@ class Preprocessor { /// Lex a token, forming a header-name token if possible. bool LexHeaderName(Token &Result, bool AllowMacroExpansion = true); - /// Lex a module name or a partition name. - bool LexModuleName(Token &Result, bool IsImport); - /// Lex the parameters for an #embed directive, returns nullopt on error. std::optional LexEmbedParameters(Token &Current, bool ForHasEmbed); + bool LexAfterModuleImport(Token &Result); - bool LexAfterModuleDecl(Token &Result); void CollectPpImportSuffix(SmallVectorImpl &Toks); void makeModuleVisible(Module *M, SourceLocation Loc); @@ -3038,9 +3039,6 @@ class Preprocessor { static bool CLK_LexAfterModuleImport(Preprocessor &P, Token &Result) { return P.LexAfterModuleImport(Result); } - static bool CLK_LexAfterModuleDecl(Preprocessor &P, Token &Result) { - return P.LexAfterModuleDecl(Result); - } }; /// Abstract base class that describes a handler that will receive @@ -3073,77 +3071,6 @@ struct EmbedAnnotationData { /// Registry of pragma handlers added by plugins using PragmaHandlerRegistry = llvm::Registry; -/// Represents module or partition name token sequance. -/// -/// module-name: -/// module-name-qualifier[opt] identifier -/// -/// partition-name: [C++20] -/// : module-name-qualifier[opt] identifier -/// -/// module-name-qualifier -/// module-name-qualifier[opt] identifier . -/// -/// This class can only be created by the preprocessor and guarantees that the -/// two source array being contiguous in memory and only contains 3 kind of -/// tokens (identifier, '.' and ':'). And only available when the preprocessor -/// returns annot_module_name token. -/// -/// For exmaple: -/// -/// export module m.n:c.d -/// -/// The module name array has 3 tokens ['m', '.', 'n']. -/// The partition name array has 4 tokens [':', 'c', '.', 'd']. -/// -/// When import a partition in a named module fragment (Eg. 
import :part1;), -/// the module name array will be empty, and the partition name array has 2 -/// tokens. -/// -/// When we meet a private-module-fragment (Eg. module :private;), preprocessor -/// will not return a annot_module_name token, but will return 2 separate tokens -/// [':', 'kw_private']. - -class ModuleNameInfo { - friend class Preprocessor; - ArrayRef ModuleName; - ArrayRef PartitionName; - - ModuleNameInfo(ArrayRef AnnotToks, std::optional ColonIndex); - -public: - /// Return the contiguous token array. - ArrayRef getTokens() const { - if (ModuleName.empty()) - return PartitionName; - if (PartitionName.empty()) - return ModuleName; - return ArrayRef(ModuleName.begin(), PartitionName.end()); - } - bool hasModuleName() const { return !ModuleName.empty(); } - bool hasPartitionName() const { return !PartitionName.empty(); } - ArrayRef getModuleName() const { return ModuleName; } - ArrayRef getPartitionName() const { return PartitionName; } - Token getColonToken() const { - assert(hasPartitionName() && "Do not have a partition name"); - return getPartitionName().front(); - } - - /// Under the standard C++ Modules, the dot is just part of the module name, - /// and not a real hierarchy separator. Flatten such module names now. - std::string getFlatName() const; - - /// Build a module id path from the contiguous token array, both include - /// module name and partition name. - void getModuleIdPath( - SmallVectorImpl> &Path) const; - - /// Build a module id path from \param ModuleName. 
- static void getModuleIdPath( - ArrayRef ModuleName, - SmallVectorImpl> &Path); -}; - } // namespace clang #endif // LLVM_CLANG_LEX_PREPROCESSOR_H diff --git a/clang/include/clang/Lex/Token.h b/clang/include/clang/Lex/Token.h index 2be3ad39529f0..4f29fb7d11415 100644 --- a/clang/include/clang/Lex/Token.h +++ b/clang/include/clang/Lex/Token.h @@ -235,9 +235,6 @@ class Token { assert(isAnnotation() && "Used AnnotVal on non-annotation token"); return PtrData; } - template T getAnnotationValueAs() const { - return static_cast(getAnnotationValue()); - } void setAnnotationValue(void *val) { assert(isAnnotation() && "Used AnnotVal on non-annotation token"); PtrData = val; diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index afcdacf02583a..93e60be512aae 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -3876,7 +3876,7 @@ class Parser : public CodeCompletionHandler { } bool ParseModuleName( - SourceLocation UseLoc, ArrayRef ModuleName, + SourceLocation UseLoc, SmallVectorImpl> &Path, bool IsImport); diff --git a/clang/lib/AST/Interp/Compiler.cpp b/clang/lib/AST/Interp/Compiler.cpp index ef579bc5d8972..0fc93c14131e6 100644 --- a/clang/lib/AST/Interp/Compiler.cpp +++ b/clang/lib/AST/Interp/Compiler.cpp @@ -1698,10 +1698,8 @@ bool Compiler::VisitUnaryExprOrTypeTraitExpr( if (Kind == UETT_VectorElements) { if (const auto *VT = E->getTypeOfArgument()->getAs()) return this->emitConst(VT->getNumElements(), E); - - // FIXME: Apparently we need to catch the fact that a sizeless vector type - // has been passed and diagnose that (at run time). 
assert(E->getTypeOfArgument()->isSizelessVectorType()); + return this->emitSizelessVectorElementSize(E); } if (Kind == UETT_VecStep) { diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index b2581b5f7b5d0..b8e4432004e99 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -2736,6 +2736,15 @@ inline bool InvalidDeclRef(InterpState &S, CodePtr OpPC, return CheckDeclRef(S, OpPC, DR); } +inline bool SizelessVectorElementSize(InterpState &S, CodePtr OpPC) { + if (S.inConstantContext()) { + const SourceRange &ArgRange = S.Current->getRange(OpPC); + const Expr *E = S.Current->getExpr(OpPC); + S.CCEDiag(E, diag::note_constexpr_non_const_vectorelements) << ArgRange; + } + return false; +} + inline bool Assume(InterpState &S, CodePtr OpPC) { const auto Val = S.Stk.pop(); diff --git a/clang/lib/AST/Interp/Opcodes.td b/clang/lib/AST/Interp/Opcodes.td index 49ebb156ab2fb..9f29fa9272711 100644 --- a/clang/lib/AST/Interp/Opcodes.td +++ b/clang/lib/AST/Interp/Opcodes.td @@ -733,6 +733,8 @@ def InvalidDeclRef : Opcode { let Args = [ArgDeclRef]; } +def SizelessVectorElementSize : Opcode; + def Assume : Opcode; def ArrayDecay : Opcode; diff --git a/clang/lib/Basic/IdentifierTable.cpp b/clang/lib/Basic/IdentifierTable.cpp index 97d830214f890..4f7ccaf4021d6 100644 --- a/clang/lib/Basic/IdentifierTable.cpp +++ b/clang/lib/Basic/IdentifierTable.cpp @@ -322,9 +322,8 @@ void IdentifierTable::AddKeywords(const LangOptions &LangOpts) { if (LangOpts.IEEE128) AddKeyword("__ieee128", tok::kw___float128, KEYALL, LangOpts, *this); - // Add the 'import' and 'module' contextual keyword. + // Add the 'import' contextual keyword. 
get("import").setModulesImport(true); - get("module").setModulesDeclaration(true); } /// Checks if the specified token kind represents a keyword in the diff --git a/clang/lib/CodeGen/CGExprConstant.cpp b/clang/lib/CodeGen/CGExprConstant.cpp index 7c65fccb60855..f22321f0e738a 100644 --- a/clang/lib/CodeGen/CGExprConstant.cpp +++ b/clang/lib/CodeGen/CGExprConstant.cpp @@ -814,8 +814,7 @@ bool ConstStructBuilder::Build(const APValue &Val, const RecordDecl *RD, llvm::Constant *VTableAddressPoint = CGM.getCXXABI().getVTableAddressPoint(BaseSubobject(CD, Offset), VTableClass); - if (auto Authentication = - CGM.getVTablePointerAuthentication(VTableClass)) { + if (auto Authentication = CGM.getVTablePointerAuthentication(CD)) { VTableAddressPoint = Emitter.tryEmitConstantSignedPointer( VTableAddressPoint, *Authentication); if (!VTableAddressPoint) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 364e55a628c34..601f7a355297d 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6666,7 +6666,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, StringRef S0 = A->getValue(), S = S0; unsigned Size, Offset = 0; if (!Triple.isAArch64() && !Triple.isLoongArch() && !Triple.isRISCV() && - !Triple.isX86()) + !Triple.isX86() && + !(!Triple.isOSAIX() && (Triple.getArch() == llvm::Triple::ppc || + Triple.getArch() == llvm::Triple::ppc64))) D.Diag(diag::err_drv_unsupported_opt_for_target) << A->getAsString(Args) << TripleStr; else if (S.consumeInteger(10, Size) || diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index 52c2ee90b1b28..ff20deb9c4f86 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -672,12 +672,41 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA, } } + // Facebook T92898286 + if (Args.hasArg(options::OPT_post_link_optimize)) + CmdArgs.push_back("-q"); + // 
End Facebook T92898286 + Args.AddAllArgs(CmdArgs, options::OPT_T); const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath()); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, Inputs, Output)); + // Facebook T92898286 + if (!Args.hasArg(options::OPT_post_link_optimize) || !Output.isFilename()) + return; + + const char *MvExec = Args.MakeArgString(ToolChain.GetProgramPath("mv")); + ArgStringList MoveCmdArgs; + MoveCmdArgs.push_back(Output.getFilename()); + const char *PreBoltBin = + Args.MakeArgString(Twine(Output.getFilename()) + ".pre-bolt"); + MoveCmdArgs.push_back(PreBoltBin); + C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), + MvExec, MoveCmdArgs, std::nullopt)); + + ArgStringList BoltCmdArgs; + const char *BoltExec = + Args.MakeArgString(ToolChain.GetProgramPath("llvm-bolt")); + BoltCmdArgs.push_back(PreBoltBin); + BoltCmdArgs.push_back("-reorder-blocks=reverse"); + BoltCmdArgs.push_back("-update-debug-sections"); + BoltCmdArgs.push_back("-o"); + BoltCmdArgs.push_back(Output.getFilename()); + C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), + BoltExec, BoltCmdArgs, std::nullopt)); + // End Facebook T92898286 } void tools::gnutools::Assembler::ConstructJob(Compilation &C, diff --git a/clang/lib/Frontend/PrintPreprocessedOutput.cpp b/clang/lib/Frontend/PrintPreprocessedOutput.cpp index 1fff88ccf0405..0592423c12eca 100644 --- a/clang/lib/Frontend/PrintPreprocessedOutput.cpp +++ b/clang/lib/Frontend/PrintPreprocessedOutput.cpp @@ -758,10 +758,9 @@ void PrintPPOutputPPCallbacks::HandleWhitespaceBeforeTok(const Token &Tok, // These tokens are not expanded to anything and don't need whitespace before // them. 
if (Tok.is(tok::eof) || - (Tok.isAnnotation() && Tok.isNot(tok::annot_header_unit) && - Tok.isNot(tok::annot_module_begin) && Tok.isNot(tok::annot_module_end) && - Tok.isNot(tok::annot_module_name) && - Tok.isNot(tok::annot_repl_input_end) && Tok.isNot(tok::annot_embed))) + (Tok.isAnnotation() && !Tok.is(tok::annot_header_unit) && + !Tok.is(tok::annot_module_begin) && !Tok.is(tok::annot_module_end) && + !Tok.is(tok::annot_repl_input_end) && !Tok.is(tok::annot_embed))) return; // EmittedDirectiveOnThisLine takes priority over RequireSameLine. @@ -952,11 +951,6 @@ static void PrintPreprocessedTokens(Preprocessor &PP, Token &Tok, PP.Lex(Tok); IsStartOfLine = true; continue; - } else if (Tok.is(tok::annot_module_name)) { - auto *Info = static_cast(Tok.getAnnotationValue()); - *Callbacks->OS << Info->getFlatName(); - PP.Lex(Tok); - continue; } else if (Tok.is(tok::annot_header_unit)) { // This is a header-name that has been (effectively) converted into a // module-name. diff --git a/clang/lib/Lex/PPLexerChange.cpp b/clang/lib/Lex/PPLexerChange.cpp index c3a903917e9ce..8221db46e06ac 100644 --- a/clang/lib/Lex/PPLexerChange.cpp +++ b/clang/lib/Lex/PPLexerChange.cpp @@ -122,8 +122,7 @@ void Preprocessor::EnterSourceFileWithLexer(Lexer *TheLexer, CurPPLexer = TheLexer; CurDirLookup = CurDir; CurLexerSubmodule = nullptr; - if (CurLexerCallback != CLK_LexAfterModuleImport && - CurLexerCallback != CLK_LexAfterModuleDecl) + if (CurLexerCallback != CLK_LexAfterModuleImport) CurLexerCallback = TheLexer->isDependencyDirectivesLexer() ? 
CLK_DependencyDirectivesLexer : CLK_Lexer; @@ -162,7 +161,8 @@ void Preprocessor::EnterMacro(Token &Tok, SourceLocation ILEnd, PushIncludeMacroStack(); CurDirLookup = nullptr; CurTokenLexer = std::move(TokLexer); - CurLexerCallback = CLK_TokenLexer; + if (CurLexerCallback != CLK_LexAfterModuleImport) + CurLexerCallback = CLK_TokenLexer; } /// EnterTokenStream - Add a "macro" context to the top of the include stack, @@ -216,8 +216,7 @@ void Preprocessor::EnterTokenStream(const Token *Toks, unsigned NumToks, PushIncludeMacroStack(); CurDirLookup = nullptr; CurTokenLexer = std::move(TokLexer); - if (CurLexerCallback != CLK_LexAfterModuleImport && - CurLexerCallback != CLK_LexAfterModuleDecl) + if (CurLexerCallback != CLK_LexAfterModuleImport) CurLexerCallback = CLK_TokenLexer; } diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp index 2726fae344337..63e27e62cffc8 100644 --- a/clang/lib/Lex/Preprocessor.cpp +++ b/clang/lib/Lex/Preprocessor.cpp @@ -860,15 +860,9 @@ bool Preprocessor::HandleIdentifier(Token &Identifier) { ModuleImportLoc = Identifier.getLocation(); NamedModuleImportPath.clear(); IsAtImport = true; + ModuleImportExpectsIdentifier = true; CurLexerCallback = CLK_LexAfterModuleImport; } - - if ((II.isModulesDeclaration() || Identifier.is(tok::kw_module)) && - !InMacroArgs && !DisableMacroExpansion && - (getLangOpts().CPlusPlusModules || getLangOpts().DebuggerSupport) && - CurLexerCallback != CLK_CachingLexer) { - CurLexerCallback = CLK_LexAfterModuleDecl; - } return true; } @@ -911,7 +905,6 @@ void Preprocessor::Lex(Token &Result) { // This token is injected to represent the translation of '#include "a.h"' // into "import a.h;". Mimic the notional ';'. 
case tok::annot_module_include: - case tok::annot_repl_input_end: case tok::semi: TrackGMFState.handleSemi(); StdCXXImportSeqState.handleSemi(); @@ -926,30 +919,12 @@ void Preprocessor::Lex(Token &Result) { StdCXXImportSeqState.handleExport(); ModuleDeclState.handleExport(); break; - case tok::annot_module_name: { - auto *Info = static_cast(Result.getAnnotationValue()); - for (const auto &Tok : Info->getTokens()) { - switch (Tok.getKind()) { - case tok::identifier: - ModuleDeclState.handleIdentifier(Tok.getIdentifierInfo()); - break; - case tok::period: - ModuleDeclState.handlePeriod(); - break; - case tok::colon: - ModuleDeclState.handleColon(); - break; - default: - llvm_unreachable("Unexpected token in module name"); - } - } - if (ModuleDeclState.isModuleCandidate()) - break; - TrackGMFState.handleMisc(); - StdCXXImportSeqState.handleMisc(); - ModuleDeclState.handleMisc(); + case tok::colon: + ModuleDeclState.handleColon(); + break; + case tok::period: + ModuleDeclState.handlePeriod(); break; - } case tok::identifier: // Check "import" and "module" when there is no open bracket. The two // identifiers are not meaningful with open brackets. 
@@ -961,17 +936,17 @@ void Preprocessor::Lex(Token &Result) { ModuleImportLoc = Result.getLocation(); NamedModuleImportPath.clear(); IsAtImport = false; + ModuleImportExpectsIdentifier = true; CurLexerCallback = CLK_LexAfterModuleImport; } break; - } - if (Result.getIdentifierInfo()->isModulesDeclaration()) { + } else if (Result.getIdentifierInfo() == getIdentifierInfo("module")) { TrackGMFState.handleModule(StdCXXImportSeqState.afterTopLevelSeq()); ModuleDeclState.handleModule(); - CurLexerCallback = CLK_LexAfterModuleDecl; break; } } + ModuleDeclState.handleIdentifier(Result.getIdentifierInfo()); if (ModuleDeclState.isModuleCandidate()) break; [[fallthrough]]; @@ -1146,151 +1121,6 @@ void Preprocessor::CollectPpImportSuffix(SmallVectorImpl &Toks) { } } -ModuleNameInfo::ModuleNameInfo(ArrayRef AnnotToks, - std::optional ColonIndex) { - assert(!AnnotToks.empty() && "Named module token cannot be empty."); - if (!ColonIndex.has_value()) - ColonIndex = AnnotToks.size(); - ModuleName = ArrayRef(AnnotToks.begin(), AnnotToks.begin() + *ColonIndex); - PartitionName = ArrayRef(AnnotToks.begin() + *ColonIndex, AnnotToks.end()); - assert(ModuleName.end() == PartitionName.begin()); -} - -std::string ModuleNameInfo::getFlatName() const { - std::string FlatModuleName; - for (auto &Tok : getTokens()) { - switch (Tok.getKind()) { - case tok::identifier: - FlatModuleName += Tok.getIdentifierInfo()->getName(); - break; - case tok::period: - FlatModuleName += '.'; - break; - case tok::colon: - FlatModuleName += ':'; - break; - default: - llvm_unreachable("Unexpected token in module name"); - } - } - return FlatModuleName; -} - -void ModuleNameInfo::getModuleIdPath( - SmallVectorImpl> &Path) const { - return getModuleIdPath(getTokens(), Path); -} - -void ModuleNameInfo::getModuleIdPath( - ArrayRef ModuleName, - SmallVectorImpl> &Path) { - for (const auto &Tok : ModuleName) { - if (Tok.is(tok::identifier)) - Path.push_back( - std::make_pair(Tok.getIdentifierInfo(), 
Tok.getLocation())); - } -} - -/// Lex a module name or a partition name. -/// -/// module-name: -/// module-name-qualifier[opt] identifier -/// -/// partition-name: [C++20] -/// : module-name-qualifier[opt] identifier -/// -/// module-name-qualifier -/// module-name-qualifier[opt] identifier . -bool Preprocessor::LexModuleName(Token &Result, bool IsImport) { - bool ExpectsIdentifier = true, IsLexingPartition = false; - SmallVector ModuleName; - std::optional ColonTokIndex; - auto LexNextToken = [&](Token &Tok) { - if (IsImport) - Lex(Tok); - else - LexUnexpandedToken(Tok); - }; - - while (true) { - LexNextToken(Result); - if (ExpectsIdentifier && Result.is(tok::identifier)) { - auto *MI = getMacroInfo(Result.getIdentifierInfo()); - if (getLangOpts().CPlusPlusModules && !IsImport && MI && - MI->isObjectLike()) { - Diag(Result, diag::err_module_decl_cannot_be_macros) - << Result.getLocation() << IsLexingPartition - << Result.getIdentifierInfo(); - } - ModuleName.push_back(Result); - ExpectsIdentifier = false; - continue; - } - - if (!ExpectsIdentifier && Result.is(tok::period)) { - ModuleName.push_back(Result); - ExpectsIdentifier = true; - continue; - } - - // Module partition only allowed in C++20 Modules. - if (getLangOpts().CPlusPlusModules && Result.is(tok::colon)) { - // Handle the form like: import :P; - // If the token after ':' is not an identifier, this is a invalid module - // name. 
- if (ModuleName.empty()) { - Token Tmp; - LexNextToken(Tmp); - EnterToken(Tmp, /*IsReiject=*/false); - // A private-module-fragment: - // export module :private; - if (!IsImport && Tmp.is(tok::kw_private)) - return true; - // import :N; - if (IsImport && Tmp.isNot(tok::identifier)) - return false; - } else if (!ExpectsIdentifier) { - ExpectsIdentifier = true; - } - IsLexingPartition = true; - ColonTokIndex = ModuleName.size(); - ModuleName.push_back(Result); - continue; - } - - // [cpp.module]/p2: where the pp-tokens (if any) shall not begin with a ( - // preprocessing token [...] - // - // We only emit diagnostic in the preprocessor, and in the parser we skip - // invalid tokens and recover from errors. - if (getLangOpts().CPlusPlusModules && !ExpectsIdentifier && - Result.is(tok::l_paren)) - Diag(Result, diag::err_unxepected_paren_in_module_decl) - << IsLexingPartition; - break; - } - - // Put the last token back to stream, it's not a valid part of module name. - // We lexed it unexpanded but it might be a valid macro expansion - Result.clearFlag(Token::DisableExpand); - auto ToksCopy = std::make_unique(1); - *ToksCopy.get() = Result; - EnterTokenStream(std::move(ToksCopy), 1, - /*DisableMacroExpansion=*/false, - /*IsReinject=*/false); - - if (ModuleName.empty()) - return false; - Result.startToken(); - Result.setKind(tok::annot_module_name); - Result.setLocation(ModuleName.front().getLocation()); - Result.setAnnotationEndLoc(ModuleName.back().getLocation()); - auto AnnotToks = ArrayRef(ModuleName).copy(getPreprocessorAllocator()); - ModuleNameInfo *Info = - new (getPreprocessorAllocator()) ModuleNameInfo(AnnotToks, ColonTokIndex); - Result.setAnnotationValue(static_cast(Info)); - return true; -} /// Lex a token following the 'import' contextual keyword. /// @@ -1315,17 +1145,6 @@ bool Preprocessor::LexAfterModuleImport(Token &Result) { // Figure out what kind of lexer we actually have. 
recomputeCurLexerKind(); - // Allocate a holding buffer for a sequence of tokens and introduce it into - // the token stream. - auto EnterTokens = [this](ArrayRef Toks) { - auto ToksCopy = std::make_unique(Toks.size()); - std::copy(Toks.begin(), Toks.end(), ToksCopy.get()); - EnterTokenStream(std::move(ToksCopy), Toks.size(), - /*DisableMacroExpansion*/ true, /*IsReinject*/ false); - }; - - SmallVector Suffix; - // Lex the next token. The header-name lexing rules are used at the start of // a pp-import. // @@ -1336,108 +1155,122 @@ bool Preprocessor::LexAfterModuleImport(Token &Result) { if (LexHeaderName(Result)) return true; - // Check for a header-name. - if (Result.is(tok::header_name)) { - // Enter the header-name token into the token stream; a Lex action cannot - // both return a token and cache tokens (doing so would corrupt the token - // cache if the call to Lex comes from CachingLex / PeekAhead). - Suffix.push_back(Result); - - // Consume the pp-import-suffix and expand any macros in it now. We'll add - // it back into the token stream later. - CollectPpImportSuffix(Suffix); - if (Suffix.back().isNot(tok::semi)) { - // This is not a pp-import after all. - EnterTokens(Suffix); - return false; - } + if (Result.is(tok::colon) && ModuleDeclState.isNamedModule()) { + std::string Name = ModuleDeclState.getPrimaryName().str(); + Name += ":"; + NamedModuleImportPath.push_back( + {getIdentifierInfo(Name), Result.getLocation()}); + CurLexerCallback = CLK_LexAfterModuleImport; + return true; + } + } else { + Lex(Result); + } - // C++2a [cpp.module]p1: - // The ';' preprocessing-token terminating a pp-import shall not have - // been produced by macro replacement. - SourceLocation SemiLoc = Suffix.back().getLocation(); - if (SemiLoc.isMacroID()) - Diag(SemiLoc, diag::err_header_import_semi_in_macro); - - // Reconstitute the import token. 
- Token ImportTok; - ImportTok.startToken(); - ImportTok.setKind(tok::kw_import); - ImportTok.setLocation(ModuleImportLoc); - ImportTok.setIdentifierInfo(getIdentifierInfo("import")); - ImportTok.setLength(6); - - auto Action = HandleHeaderIncludeOrImport( - /*HashLoc*/ SourceLocation(), ImportTok, Suffix.front(), SemiLoc); - switch (Action.Kind) { - case ImportAction::None: - break; + // Allocate a holding buffer for a sequence of tokens and introduce it into + // the token stream. + auto EnterTokens = [this](ArrayRef Toks) { + auto ToksCopy = std::make_unique(Toks.size()); + std::copy(Toks.begin(), Toks.end(), ToksCopy.get()); + EnterTokenStream(std::move(ToksCopy), Toks.size(), + /*DisableMacroExpansion*/ true, /*IsReinject*/ false); + }; - case ImportAction::ModuleBegin: - // Let the parser know we're textually entering the module. - Suffix.emplace_back(); - Suffix.back().startToken(); - Suffix.back().setKind(tok::annot_module_begin); - Suffix.back().setLocation(SemiLoc); - Suffix.back().setAnnotationEndLoc(SemiLoc); - Suffix.back().setAnnotationValue(Action.ModuleForHeader); - [[fallthrough]]; - - case ImportAction::ModuleImport: - case ImportAction::HeaderUnitImport: - case ImportAction::SkippedModuleImport: - // We chose to import (or textually enter) the file. Convert the - // header-name token into a header unit annotation token. - Suffix[0].setKind(tok::annot_header_unit); - Suffix[0].setAnnotationEndLoc(Suffix[0].getLocation()); - Suffix[0].setAnnotationValue(Action.ModuleForHeader); - // FIXME: Call the moduleImport callback? - break; - case ImportAction::Failure: - assert(TheModuleLoader.HadFatalFailure && - "This should be an early exit only to a fatal error"); - Result.setKind(tok::eof); - CurLexer->cutOffLexing(); - EnterTokens(Suffix); - return true; - } + bool ImportingHeader = Result.is(tok::header_name); + // Check for a header-name. 
+ SmallVector Suffix; + if (ImportingHeader) { + // Enter the header-name token into the token stream; a Lex action cannot + // both return a token and cache tokens (doing so would corrupt the token + // cache if the call to Lex comes from CachingLex / PeekAhead). + Suffix.push_back(Result); + // Consume the pp-import-suffix and expand any macros in it now. We'll add + // it back into the token stream later. + CollectPpImportSuffix(Suffix); + if (Suffix.back().isNot(tok::semi)) { + // This is not a pp-import after all. EnterTokens(Suffix); return false; } - } else { - Lex(Result); - } - if (Result.isOneOf(tok::identifier, tok::colon)) { - EnterToken(Result, /*IsReinject=*/false); - if (!LexModuleName(Result, /*IsImport=*/true)) + // C++2a [cpp.module]p1: + // The ';' preprocessing-token terminating a pp-import shall not have + // been produced by macro replacement. + SourceLocation SemiLoc = Suffix.back().getLocation(); + if (SemiLoc.isMacroID()) + Diag(SemiLoc, diag::err_header_import_semi_in_macro); + + // Reconstitute the import token. + Token ImportTok; + ImportTok.startToken(); + ImportTok.setKind(tok::kw_import); + ImportTok.setLocation(ModuleImportLoc); + ImportTok.setIdentifierInfo(getIdentifierInfo("import")); + ImportTok.setLength(6); + + auto Action = HandleHeaderIncludeOrImport( + /*HashLoc*/ SourceLocation(), ImportTok, Suffix.front(), SemiLoc); + switch (Action.Kind) { + case ImportAction::None: + break; + + case ImportAction::ModuleBegin: + // Let the parser know we're textually entering the module. + Suffix.emplace_back(); + Suffix.back().startToken(); + Suffix.back().setKind(tok::annot_module_begin); + Suffix.back().setLocation(SemiLoc); + Suffix.back().setAnnotationEndLoc(SemiLoc); + Suffix.back().setAnnotationValue(Action.ModuleForHeader); + [[fallthrough]]; + + case ImportAction::ModuleImport: + case ImportAction::HeaderUnitImport: + case ImportAction::SkippedModuleImport: + // We chose to import (or textually enter) the file. 
Convert the + // header-name token into a header unit annotation token. + Suffix[0].setKind(tok::annot_header_unit); + Suffix[0].setAnnotationEndLoc(Suffix[0].getLocation()); + Suffix[0].setAnnotationValue(Action.ModuleForHeader); + // FIXME: Call the moduleImport callback? + break; + case ImportAction::Failure: + assert(TheModuleLoader.HadFatalFailure && + "This should be an early exit only to a fatal error"); + Result.setKind(tok::eof); + CurLexer->cutOffLexing(); + EnterTokens(Suffix); return true; - auto *Info = Result.getAnnotationValueAs(); - if (getLangOpts().CPlusPlusModules) { - // Under the standard C++ Modules, the dot is just part of the module - // name, and not a real hierarchy separator. Flatten such module names - // now. - // - // FIXME: Is this the right level to be performing this transformation? - std::string FlatModuleName; - if (Info->getTokens().front().is(tok::colon)) { - // Import a module partition allowed in C++20 Modules. - // We can import a partition in named module TU. - if (NamedModuleImportPath.empty() && ModuleDeclState.isNamedModule()) - FlatModuleName = llvm::Twine(ModuleDeclState.getPrimaryName()) - .concat(Info->getFlatName()) - .str(); - else - return true; - } else { - FlatModuleName = Info->getFlatName(); - } - NamedModuleImportPath.emplace_back(getIdentifierInfo(FlatModuleName), - Result.getLocation()); - } else { - Info->getModuleIdPath(NamedModuleImportPath); } + + EnterTokens(Suffix); + return false; + } + + // The token sequence + // + // import identifier (. identifier)* + // + // indicates a module import directive. We already saw the 'import' + // contextual keyword, so now we're looking for the identifiers. + if (ModuleImportExpectsIdentifier && Result.getKind() == tok::identifier) { + // We expected to see an identifier here, and we did; continue handling + // identifiers. 
+ NamedModuleImportPath.push_back( + std::make_pair(Result.getIdentifierInfo(), Result.getLocation())); + ModuleImportExpectsIdentifier = false; + CurLexerCallback = CLK_LexAfterModuleImport; + return true; + } + + // If we're expecting a '.' or a ';', and we got a '.', then wait until we + // see the next identifier. (We can also see a '[[' that begins an + // attribute-specifier-seq here under the Standard C++ Modules.) + if (!ModuleImportExpectsIdentifier && Result.getKind() == tok::period) { + ModuleImportExpectsIdentifier = true; + CurLexerCallback = CLK_LexAfterModuleImport; + return true; } // If we didn't recognize a module name at all, this is not a (valid) import. @@ -1458,6 +1291,24 @@ bool Preprocessor::LexAfterModuleImport(Token &Result) { SemiLoc = Suffix.back().getLocation(); } + // Under the standard C++ Modules, the dot is just part of the module name, + // and not a real hierarchy separator. Flatten such module names now. + // + // FIXME: Is this the right level to be performing this transformation? + std::string FlatModuleName; + if (getLangOpts().CPlusPlusModules) { + for (auto &Piece : NamedModuleImportPath) { + // If the FlatModuleName ends with colon, it implies it is a partition. + if (!FlatModuleName.empty() && FlatModuleName.back() != ':') + FlatModuleName += "."; + FlatModuleName += Piece.first->getName(); + } + SourceLocation FirstPathLoc = NamedModuleImportPath[0].second; + NamedModuleImportPath.clear(); + NamedModuleImportPath.push_back( + std::make_pair(getIdentifierInfo(FlatModuleName), FirstPathLoc)); + } + Module *Imported = nullptr; // We don't/shouldn't load the standard c++20 modules when preprocessing. if (getLangOpts().Modules && !isInImportingCXXNamedModules()) { @@ -1479,33 +1330,6 @@ bool Preprocessor::LexAfterModuleImport(Token &Result) { return true; } -/// Lex a token following the 'module' contextual keyword. 
-/// -/// [cpp.module]/p2: -/// The pp-tokens, if any, of a pp-module shall be of the form: -/// pp-module-name pp-module-partition[opt] pp-tokens[opt] -/// -/// where the pp-tokens (if any) shall not begin with a ( preprocessing token -/// and the grammar non-terminals are defined as: -/// pp-module-name: -/// pp-module-name-qualifierp[opt] identifier -/// pp-module-partition: -/// : pp-module-name-qualifier[opt] identifier -/// pp-module-name-qualifier: -/// identifier . -/// pp-module-name-qualifier identifier . -/// No identifier in the pp-module-name or pp-module-partition shall currently -/// be defined as an object-like macro. -/// -/// [cpp.module]/p3: -/// Any preprocessing tokens after the module preprocessing token in the module -/// directive are processed just as in normal text. -bool Preprocessor::LexAfterModuleDecl(Token &Result) { - // Figure out what kind of lexer we actually have. - recomputeCurLexerKind(); - return LexModuleName(Result, /*IsImport=*/false); -} - void Preprocessor::makeModuleVisible(Module *M, SourceLocation Loc) { CurSubmoduleState->VisibleModules.setVisible( M, Loc, [](Module *) {}, diff --git a/clang/lib/Lex/TokenConcatenation.cpp b/clang/lib/Lex/TokenConcatenation.cpp index cdb636923b9e9..865879d180533 100644 --- a/clang/lib/Lex/TokenConcatenation.cpp +++ b/clang/lib/Lex/TokenConcatenation.cpp @@ -160,13 +160,6 @@ static char GetFirstChar(const Preprocessor &PP, const Token &Tok) { bool TokenConcatenation::AvoidConcat(const Token &PrevPrevTok, const Token &PrevTok, const Token &Tok) const { - // If previous token is a module name, we need avoid concat it with current - // token, otherwise, there will has an extra space between 'M' and ';' for the - // following code: - // - // import M; - if (PrevTok.is(tok::annot_module_name)) - return false; // Conservatively assume that every annotation token that has a printable // form requires whitespace. 
if (PrevTok.isAnnotation()) @@ -197,9 +190,6 @@ bool TokenConcatenation::AvoidConcat(const Token &PrevPrevTok, return true; ConcatInfo &= ~aci_avoid_equal; } - - if (Tok.is(tok::annot_module_name)) - return true; if (Tok.isAnnotation()) { // Modules annotation can show up when generated automatically for includes. assert(Tok.isOneOf(tok::annot_module_include, tok::annot_module_begin, diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 577527d0318f2..7ce9a9cea1c7a 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -3958,13 +3958,7 @@ void Parser::ParseDeclarationSpecifiers( // We're done with the declaration-specifiers. goto DoneWithDeclSpec; - case tok::annot_module_name: { - PP.EnterTokenStream( - Tok.getAnnotationValueAs()->getTokens(), - /*DisableMacroExpansion=*/true, /*IsReinject=*/false); - ConsumeAnyToken(); - [[fallthrough]]; - } + // typedef-name case tok::kw___super: case tok::kw_decltype: diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index afb2e1e416168..5ebe71e496a2e 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -2511,28 +2511,18 @@ Parser::ParseModuleDecl(Sema::ModuleImportState &ImportState) { } SmallVector, 2> Path; - if (Tok.isNot(tok::annot_module_name)) { - Diag(Tok, diag::err_module_expected_ident) << /*IsImport=*/false; - SkipUntil(tok::semi, StopBeforeMatch); - return nullptr; - } - - auto *Info = Tok.getAnnotationValueAs(); - ConsumeAnnotationToken(); - if (ParseModuleName(ModuleLoc, Info->getModuleName(), Path, - /*IsImport=*/false)) + if (ParseModuleName(ModuleLoc, Path, /*IsImport*/ false)) return nullptr; // Parse the optional module-partition. 
SmallVector, 2> Partition; - if (Info->hasPartitionName()) { - SourceLocation ColonLoc = Info->getColonToken().getLocation(); + if (Tok.is(tok::colon)) { + SourceLocation ColonLoc = ConsumeToken(); if (!getLangOpts().CPlusPlusModules) Diag(ColonLoc, diag::err_unsupported_module_partition) << SourceRange(ColonLoc, Partition.back().second); // Recover by ignoring the partition name. - else if (ParseModuleName(ModuleLoc, Info->getPartitionName(), Partition, - /*IsImport=*/false)) + else if (ParseModuleName(ModuleLoc, Partition, /*IsImport*/ false)) return nullptr; } @@ -2591,32 +2581,18 @@ Decl *Parser::ParseModuleImport(SourceLocation AtLoc, // This is a header import that the preprocessor mapped to a module import. HeaderUnit = reinterpret_cast(Tok.getAnnotationValue()); ConsumeAnnotationToken(); - } else { - if (Tok.isNot(tok::annot_module_name)) { - if (Tok.is(tok::code_completion)) { - cutOffParsing(); - Actions.CodeCompletion().CodeCompleteModuleImport(ImportLoc, Path); - return nullptr; - } - Diag(Tok, diag::err_module_expected_ident) << /*IsImport=*/true; - SkipUntil(tok::semi, StopBeforeMatch); + } else if (Tok.is(tok::colon)) { + SourceLocation ColonLoc = ConsumeToken(); + if (!getLangOpts().CPlusPlusModules) + Diag(ColonLoc, diag::err_unsupported_module_partition) + << SourceRange(ColonLoc, Path.back().second); + // Recover by leaving partition empty. + else if (ParseModuleName(ColonLoc, Path, /*IsImport*/ true)) return nullptr; - } - auto *Info = Tok.getAnnotationValueAs(); - ConsumeAnnotationToken(); - if (Info->hasPartitionName()) { - SourceLocation ColonLoc = Info->getColonToken().getLocation(); - if (!getLangOpts().CPlusPlusModules) - Diag(ColonLoc, diag::err_unsupported_module_partition) - << SourceRange(ColonLoc, Path.back().second); - // Recover by leaving partition empty. 
- else if (ParseModuleName(ColonLoc, Info->getPartitionName(), Path, - /*IsImport=*/true)) - return nullptr; - else - IsPartition = true; - } else if (ParseModuleName(ImportLoc, Info->getModuleName(), Path, - /*IsImport=*/true)) + else + IsPartition = true; + } else { + if (ParseModuleName(ImportLoc, Path, /*IsImport*/ true)) return nullptr; } @@ -2713,31 +2689,32 @@ Decl *Parser::ParseModuleImport(SourceLocation AtLoc, /// module-name-qualifier: /// module-name-qualifier[opt] identifier '.' bool Parser::ParseModuleName( - SourceLocation UseLoc, ArrayRef ModuleName, + SourceLocation UseLoc, SmallVectorImpl> &Path, bool IsImport) { - ModuleNameInfo::getModuleIdPath(ModuleName, Path); - // Eg. import A.B. - if (ModuleName.back().isNot(tok::identifier)) { - if (Tok.is(tok::code_completion)) { - cutOffParsing(); - Actions.CodeCompletion().CodeCompleteModuleImport(UseLoc, Path); + // Parse the module path. + while (true) { + if (!Tok.is(tok::identifier)) { + if (Tok.is(tok::code_completion)) { + cutOffParsing(); + Actions.CodeCompletion().CodeCompleteModuleImport(UseLoc, Path); + return true; + } + + Diag(Tok, diag::err_module_expected_ident) << IsImport; + SkipUntil(tok::semi); return true; } - Diag(ModuleName.back(), diag::err_module_expected_ident) << IsImport; - SkipUntil(tok::semi, StopBeforeMatch); - return true; - } - // [cpp.module]/p2: where the pp-tokens (if any) shall not begin with a ( - // preprocessing token [...] - // - // Skip unitl ';' to recovery. - if (getLangOpts().CPlusPlusModules && Tok.is(tok::l_paren)) { - SkipUntil(tok::semi, StopBeforeMatch); - return true; + // Record this part of the module path. 
+ Path.push_back(std::make_pair(Tok.getIdentifierInfo(), Tok.getLocation())); + ConsumeToken(); + + if (Tok.isNot(tok::period)) + return false; + + ConsumeToken(); } - return false; } /// Try recover parser when module annotation appears where it must not diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 50e7be9ea6020..5fd8622c90dd8 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -5247,6 +5247,10 @@ static void handleXRayLogArgsAttr(Sema &S, Decl *D, const ParsedAttr &AL) { static void handlePatchableFunctionEntryAttr(Sema &S, Decl *D, const ParsedAttr &AL) { + if (S.Context.getTargetInfo().getTriple().isOSAIX()) { + S.Diag(AL.getLoc(), diag::err_aix_attr_unsupported) << AL; + return; + } uint32_t Count = 0, Offset = 0; if (!S.checkUInt32Argument(AL, AL.getArgAsExpr(0), Count, 0, true)) return; diff --git a/clang/test/CXX/cpp/cpp.module/p2.cppm b/clang/test/CXX/cpp/cpp.module/p2.cppm deleted file mode 100644 index 966a88ccfa972..0000000000000 --- a/clang/test/CXX/cpp/cpp.module/p2.cppm +++ /dev/null @@ -1,88 +0,0 @@ -// RUN: rm -rf %t -// RUN: mkdir -p %t -// RUN: split-file %s %t - -// RUN: %clang_cc1 -std=c++20 %t/A.cppm -triple x86_64-linux-gnu -verify -// RUN: %clang_cc1 -std=c++20 %t/B.cppm -triple x86_64-linux-gnu -verify -// RUN: %clang_cc1 -std=c++20 %t/C.cppm -triple x86_64-linux-gnu -verify -// RUN: %clang_cc1 -std=c++20 %t/D.cppm -triple x86_64-linux-gnu -verify -// RUN: %clang_cc1 -std=c++20 %t/E.cppm -triple x86_64-linux-gnu -verify -// RUN: %clang_cc1 -std=c++20 %t/F.cppm -triple x86_64-linux-gnu -verify -// RUN: %clang_cc1 -std=c++20 %t/G.cppm -triple x86_64-linux-gnu -verify -// RUN: %clang_cc1 -std=c++20 %t/H.cppm -triple x86_64-linux-gnu -verify -// RUN: %clang_cc1 -std=c++20 %t/I.cppm -triple x86_64-linux-gnu -verify -// RUN: %clang_cc1 -std=c++20 %t/J.cppm -triple x86_64-linux-gnu -verify - -//--- version.h -#ifndef VERSION_H -#define VERSION_H - -#define VERSION libv5 
-#define A a -#define B b -#define C c -#define FUNC_LIKE(X) function_like_##X -#define ATTRS [[]] -#define SEMICOLON ; - -#endif // VERSION_H - -//--- A.cppm -module; -#include "version.h" -export module VERSION; // expected-error {{the module name in a module declaration cannot contain an object-like macro 'VERSION'}} - -//--- B.cppm -module; -#include "version.h" -export module A.B; // expected-error {{the module name in a module declaration cannot contain an object-like macro 'A'}} \ - // expected-error {{the module name in a module declaration cannot contain an object-like macro 'B'}} - -//--- C.cppm -module; // expected-error {{missing 'module' declaration at end of global module fragment introduced here}} -#include "version.h" -export module A.FUNC_LIKE(foo):C; // expected-error {{the module name in a module declaration cannot contain an object-like macro 'A'}} \ - // expected-error {{unexpected '(' after the module name in a module declaration}} - -//--- D.cppm -module; // expected-error {{missing 'module' declaration at end of global module fragment introduced here}} -#include "version.h" -export module B.A.FUNC_LIKE(bar):C; // expected-error {{the module name in a module declaration cannot contain an object-like macro 'B'}} \ - // expected-error {{the module name in a module declaration cannot contain an object-like macro 'A'}} \ - // expected-error {{unexpected '(' after the module name in a module declaration}} - -//--- E.cppm -module; -#include "version.h" -export module a.FUNC_LIKE:c; // OK, FUNC_LIKE would not be treated as a macro name. -// expected-no-diagnostics - -//--- F.cppm -module; -#include "version.h" -export module a.FUNC_LIKE:c ATTRS; // OK, FUNC_LIKE would not be treated as a macro name. 
-// expected-no-diagnostics - -//--- G.cppm -module; // expected-error {{missing 'module' declaration at end of global module fragment introduced here}} -#include "version.h" -export module A.FUNC_LIKE(B c:C ATTRS // expected-error {{the module name in a module declaration cannot contain an object-like macro 'A'}} \ - // expected-error {{unexpected '(' after the module name in a module declaration}} - -//--- H.cppm -module; // expected-error {{missing 'module' declaration at end of global module fragment introduced here}} -#include "version.h" -export module A.FUNC_LIKE(B,). c:C ATTRS // expected-error {{the module name in a module declaration cannot contain an object-like macro 'A'}} \ - // expected-error {{unexpected '(' after the module name in a module declaration}} - -//--- I.cppm -module; // expected-error {{missing 'module' declaration at end of global module fragment introduced here}} -#include "version.h" -export module A.FUNC_LIKE(B,) c:C ATTRS // expected-error {{the module name in a module declaration cannot contain an object-like macro 'A'}} \ - // expected-error {{unexpected '(' after the module name in a module declaration}} - -//--- J.cppm -module; -#include "version.h" -export module unexpanded : unexpanded ATTRS SEMICOLON // OK, ATTRS and SEMICOLON can be expanded. -// expected-no-diagnostics diff --git a/clang/test/CXX/module/basic/basic.link/module-declaration.cpp b/clang/test/CXX/module/basic/basic.link/module-declaration.cpp index 14bbc911febfc..d71358cc7a571 100644 --- a/clang/test/CXX/module/basic/basic.link/module-declaration.cpp +++ b/clang/test/CXX/module/basic/basic.link/module-declaration.cpp @@ -8,19 +8,27 @@ // RUN: %clang_cc1 -std=c++20 -emit-module-interface -fmodule-file=x=%t/x.pcm %t/x.y.cppm -o %t/x.y.pcm // // Module implementation for unknown and known module. (The former is ill-formed.) 
-// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify -x c++ %t/M1.cpp -// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x=%t/x.pcm -fmodule-file=x.y=%t/x.y.pcm -verify -x c++ %t/M2.cpp +// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify -x c++ %t/M.cpp \ +// RUN: -DTEST=1 -DEXPORT= -DMODULE_NAME=z +// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x=%t/x.pcm -fmodule-file=x.y=%t/x.y.pcm -verify -x c++ %t/M.cpp \ +// RUN: -DTEST=2 -DEXPORT= -DMODULE_NAME=x // // Module interface for unknown and known module. (The latter is ill-formed due to // redefinition.) -// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M3.cpp -// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M4.cpp +// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M.cpp \ +// RUN: -DTEST=3 -DEXPORT=export -DMODULE_NAME=z +// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M.cpp \ +// RUN: -DTEST=4 -DEXPORT=export -DMODULE_NAME=x // // Miscellaneous syntax. 
-// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M5.cpp -// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M6.cpp -// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M7.cpp -// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M8.cpp +// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M.cpp \ +// RUN: -DTEST=7 -DEXPORT=export -DMODULE_NAME='z elderberry' +// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M.cpp \ +// RUN: -DTEST=8 -DEXPORT=export -DMODULE_NAME='z [[]]' +// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M.cpp \ +// RUN: -DTEST=9 -DEXPORT=export -DMODULE_NAME='z [[fancy]]' +// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -verify %t/M.cpp \ +// RUN: -DTEST=10 -DEXPORT=export -DMODULE_NAME='z [[maybe_unused]]' //--- x.cppm export module x; @@ -30,26 +38,17 @@ int a, b; export module x.y; int c; -//--- M1.cpp -module z; // expected-error {{module 'z' not found}} - -//--- M2.cpp -module x; // expected-no-diagnostics - -//--- M3.cpp -export module z; // expected-no-diagnostics - -//--- M4.cpp -export module x; // expected-no-diagnostics - -//--- M5.cpp -export module z elderberry; // expected-error {{expected ';'}} expected-error {{a type specifier is required}} - -//--- M6.cpp -export module z [[]]; // expected-no-diagnostics - -//--- M7.cpp -export module z [[fancy]]; // expected-warning {{unknown attribute 'fancy' ignored}} - -//--- M8.cpp -export module z [[maybe_unused]]; // expected-error-re {{'maybe_unused' attribute cannot be applied to a module{{$}}}} +//--- M.cpp + +EXPORT module MODULE_NAME; +#if TEST == 7 +// expected-error@-2 {{expected ';'}} expected-error@-2 {{a type specifier is required}} +#elif TEST == 9 +// expected-warning@-4 {{unknown attribute 'fancy' ignored}} +#elif TEST == 10 +// expected-error-re@-6 {{'maybe_unused' attribute cannot be applied to 
a module{{$}}}} +#elif TEST == 1 +// expected-error@-8 {{module 'z' not found}} +#else +// expected-no-diagnostics +#endif diff --git a/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.import/p1.cppm b/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.import/p1.cppm index ecad4db32a7e9..873e4c0edeac2 100644 --- a/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.import/p1.cppm +++ b/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.import/p1.cppm @@ -6,12 +6,10 @@ // RUN: %clang_cc1 -std=c++20 -emit-module-interface -fmodule-file=x=%t/x.pcm %t/x.y.cppm -o %t/x.y.pcm // RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/a.b.cppm -o %t/a.b.pcm // -// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -fmodule-file=x=%t/x.pcm -verify %t/test-interface.cpp \ -// RUN: -DINTERFACE +// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -fmodule-file=x=%t/x.pcm -verify %t/test.cpp \ +// RUN: -DMODULE_NAME=z -DINTERFACE // RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -fmodule-file=x=%t/x.pcm \ -// RUN: -fmodule-file=a.b=%t/a.b.pcm -verify %t/test.cpp -// RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -fmodule-file=x=%t/x.pcm \ -// RUN: -verify %t/test-module-not-found.cpp +// RUN: -fmodule-file=a.b=%t/a.b.pcm -verify %t/test.cpp -DMODULE_NAME=a.b // RUN: %clang_cc1 -std=c++20 -I%t -fmodule-file=x.y=%t/x.y.pcm -fmodule-file=x=%t/x.pcm -verify %t/test.x.cpp //--- x.cppm @@ -36,26 +34,11 @@ int use_2 = b; // ok int use_3 = c; // expected-error {{use of undeclared identifier 'c'}} //--- test.cpp -module; -module a.b; - -import x; - -import x [[]]; -import x [[foo]]; // expected-warning {{unknown attribute 'foo' ignored}} -import x [[noreturn]]; // expected-error {{'noreturn' attribute cannot be applied to a module import}} -import x [[blarg::noreturn]]; // expected-warning {{unknown attribute 'noreturn' ignored}} - -import x.y; -import x.; // expected-error {{expected a module name after 'import'}} -import .x; // 
expected-error {{expected a module name after 'import'}} - -int use_4 = c; // ok - - -//--- test-interface.cpp -module; -export module z; +#ifdef INTERFACE +export module MODULE_NAME; +#else +module MODULE_NAME; +#endif import x; @@ -68,10 +51,6 @@ import x.y; import x.; // expected-error {{expected a module name after 'import'}} import .x; // expected-error {{expected a module name after 'import'}} -int use_4 = c; // ok - -//--- test-module-not-found.cpp -module; - import blarg; // expected-error {{module 'blarg' not found}} +int use_4 = c; // ok diff --git a/clang/test/CodeGenCXX/ptrauth-global-constant-initializers.cpp b/clang/test/CodeGenCXX/ptrauth-global-constant-initializers.cpp new file mode 100644 index 0000000000000..f0c3ea83d8958 --- /dev/null +++ b/clang/test/CodeGenCXX/ptrauth-global-constant-initializers.cpp @@ -0,0 +1,234 @@ +// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-calls -fno-rtti -fptrauth-vtable-pointer-type-discrimination -fptrauth-vtable-pointer-address-discrimination -emit-llvm -o - %s | FileCheck %s + +// CHECK: %struct.Base1 = type { ptr } +// CHECK: %struct.Base2 = type { ptr } +// CHECK: %struct.Derived1 = type { %struct.Base1, %struct.Base2 } +// CHECK: %struct.Derived2 = type { %struct.Base2, %struct.Base1 } +// CHECK: %struct.Derived3 = type { %struct.Base1, %struct.Base2 } + +// CHECK: @_ZTV5Base1 = linkonce_odr unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC:38871]], ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV5Base1, i32 0, i32 0, i32 2))] }, align 8 +// CHECK: @g_b1 = global %struct.Base1 { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV5Base1, i32 0, i32 0, i32 2), i32 2, i64 [[BASE1_VTABLE_DISC:6511]], ptr @g_b1) }, align 8 +// CHECK: @_ZTV5Base2 = linkonce_odr unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC:27651]], ptr 
getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV5Base2, i32 0, i32 0, i32 2))] }, align 8 +// CHECK: @g_b2 = global %struct.Base2 { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV5Base2, i32 0, i32 0, i32 2), i32 2, i64 [[BASE2_VTABLE_DISC:63631]], ptr @g_b2) }, align 8 +// CHECK: @_ZTV8Derived1 = linkonce_odr unnamed_addr constant { [5 x ptr], [3 x ptr] } { [5 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 0, i32 2)), ptr ptrauth (ptr @_ZN8Derived11cEv, i32 0, i64 [[DERIVED1_C_DISC:54092]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 0, i32 3)), ptr ptrauth (ptr @_ZN8Derived11dEv, i32 0, i64 [[DERIVED1_D_DISC:37391]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 0, i32 4))], [3 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 1, i32 2))] }, align 8 +// CHECK: @g_d1 = global { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 24) ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 0, i32 2), i32 2, i64 [[BASE1_VTABLE_DISC]], ptr @g_d1), ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived1, i32 0, i32 1, i32 2), i32 2, i64 [[BASE2_VTABLE_DISC]], ptr getelementptr inbounds ({ ptr, ptr }, ptr @g_d1, i32 0, i32 1)) }, align 8 +// CHECK: @_ZTV8Derived2 = linkonce_odr unnamed_addr constant { [5 x ptr], [3 x ptr] } { [5 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 0, i32 2)), ptr ptrauth (ptr @_ZN8Derived21cEv, i32 0, i64 [[DERIVED2_C_DISC:15537]], ptr getelementptr inbounds ({ [5 x ptr], [3 x 
ptr] }, ptr @_ZTV8Derived2, i32 0, i32 0, i32 3)), ptr ptrauth (ptr @_ZN8Derived21eEv, i32 0, i64 209, ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 0, i32 4))], [3 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 1, i32 2))] }, align 8 +// CHECK: @g_d2 = global { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 24) ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 0, i32 2), i32 2, i64 [[BASE2_VTABLE_DISC]], ptr @g_d2), ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [5 x ptr], [3 x ptr] }, ptr @_ZTV8Derived2, i32 0, i32 1, i32 2), i32 2, i64 [[BASE1_VTABLE_DISC]], ptr getelementptr inbounds ({ ptr, ptr }, ptr @g_d2, i32 0, i32 1)) }, align 8 +// CHECK: @_ZTV8Derived3 = linkonce_odr unnamed_addr constant { [4 x ptr], [3 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV8Derived3, i32 0, i32 0, i32 2)), ptr ptrauth (ptr @_ZN8Derived31iEv, i32 0, i64 [[DERIVED3_I_DISC:19084]], ptr getelementptr inbounds ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV8Derived3, i32 0, i32 0, i32 3))], [3 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV8Derived3, i32 0, i32 1, i32 2))] }, align 8 +// CHECK: @g_d3 = global { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 16) ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV8Derived3, i32 0, i32 0, i32 2), i32 2, i64 [[BASE1_VTABLE_DISC]], ptr @g_d3), ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV8Derived3, i32 0, i32 1, i32 2), i32 2, i64 [[BASE2_VTABLE_DISC]], ptr getelementptr inbounds ({ ptr, ptr }, ptr @g_d3, i32 0, i32 1)) }, align 8 
+// CHECK: @g_vb1 = global %struct.VirtualBase1 zeroinitializer, align 8 +// CHECK: @g_vb2 = global %struct.VirtualBase2 zeroinitializer, align 8 +// CHECK: @g_d4 = global %struct.Derived4 zeroinitializer, align 8 +// CHECK: @_ZTV12VirtualBase1 = linkonce_odr unnamed_addr constant { [6 x ptr] } { [6 x ptr] [ptr null, ptr null, ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [6 x ptr] }, ptr @_ZTV12VirtualBase1, i32 0, i32 0, i32 4)), ptr ptrauth (ptr @_ZN12VirtualBase11fEv, i32 0, i64 [[VIRTUALBASE1_F_DISC:7987]], ptr getelementptr inbounds ({ [6 x ptr] }, ptr @_ZTV12VirtualBase1, i32 0, i32 0, i32 5))] }, align 8 +// CHECK: @_ZTT12VirtualBase1 = linkonce_odr unnamed_addr constant [2 x ptr] [ptr ptrauth (ptr getelementptr inbounds inrange(-32, 16) ({ [6 x ptr] }, ptr @_ZTV12VirtualBase1, i32 0, i32 0, i32 4), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-32, 16) ({ [6 x ptr] }, ptr @_ZTV12VirtualBase1, i32 0, i32 0, i32 4), i32 2)], align 8 +// CHECK: @_ZTV12VirtualBase2 = linkonce_odr unnamed_addr constant { [5 x ptr], [4 x ptr] } { [5 x ptr] [ptr inttoptr (i64 8 to ptr), ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTV12VirtualBase2, i32 0, i32 0, i32 3)), ptr ptrauth (ptr @_ZN12VirtualBase21gEv, i32 0, i64 [[VIRTUALBASE2_G_DISC:51224]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTV12VirtualBase2, i32 0, i32 0, i32 4))], [4 x ptr] [ptr null, ptr inttoptr (i64 -8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTV12VirtualBase2, i32 0, i32 1, i32 3))] }, align 8 +// CHECK: @_ZTT12VirtualBase2 = linkonce_odr unnamed_addr constant [2 x ptr] [ptr ptrauth (ptr getelementptr inbounds inrange(-24, 16) ({ [5 x ptr], [4 x ptr] }, ptr @_ZTV12VirtualBase2, i32 0, i32 0, i32 3), i32 2), ptr 
ptrauth (ptr getelementptr inbounds inrange(-24, 8) ({ [5 x ptr], [4 x ptr] }, ptr @_ZTV12VirtualBase2, i32 0, i32 1, i32 3), i32 2)], align 8 +// CHECK: @_ZTV8Derived4 = linkonce_odr unnamed_addr constant { [7 x ptr], [5 x ptr] } { [7 x ptr] [ptr null, ptr null, ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 0, i32 4)), ptr ptrauth (ptr @_ZN12VirtualBase11fEv, i32 0, i64 [[VIRTUALBASE1_F_DISC]], ptr getelementptr inbounds ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 0, i32 5)), ptr ptrauth (ptr @_ZN8Derived41hEv, i32 0, i64 [[DERIVED4_H_DISC:31844]], ptr getelementptr inbounds ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 0, i32 6))], [5 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr inttoptr (i64 -8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 1, i32 3)), ptr ptrauth (ptr @_ZN12VirtualBase21gEv, i32 0, i64 [[VIRTUALBASE2_G_DISC]], ptr getelementptr inbounds ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 1, i32 4))] }, align 8 +// CHECK: @_ZTT8Derived4 = linkonce_odr unnamed_addr constant [7 x ptr] [ptr ptrauth (ptr getelementptr inbounds inrange(-32, 24) ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 0, i32 4), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-32, 16) ({ [6 x ptr] }, ptr @_ZTC8Derived40_12VirtualBase1, i32 0, i32 0, i32 4), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-32, 16) ({ [6 x ptr] }, ptr @_ZTC8Derived40_12VirtualBase1, i32 0, i32 0, i32 4), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-24, 16) ({ [5 x ptr], [4 x ptr] }, ptr @_ZTC8Derived48_12VirtualBase2, i32 0, i32 0, i32 3), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-24, 8) ({ [5 x ptr], [4 x ptr] }, ptr @_ZTC8Derived48_12VirtualBase2, i32 0, i32 1, i32 3), i32 2), 
ptr ptrauth (ptr getelementptr inbounds inrange(-32, 24) ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 0, i32 4), i32 2), ptr ptrauth (ptr getelementptr inbounds inrange(-24, 16) ({ [7 x ptr], [5 x ptr] }, ptr @_ZTV8Derived4, i32 0, i32 1, i32 3), i32 2)], align 8 +// CHECK: @_ZTC8Derived40_12VirtualBase1 = linkonce_odr unnamed_addr constant { [6 x ptr] } { [6 x ptr] [ptr null, ptr null, ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [6 x ptr] }, ptr @_ZTC8Derived40_12VirtualBase1, i32 0, i32 0, i32 4)), ptr ptrauth (ptr @_ZN12VirtualBase11fEv, i32 0, i64 [[VIRTUALBASE1_F_DISC]], ptr getelementptr inbounds ({ [6 x ptr] }, ptr @_ZTC8Derived40_12VirtualBase1, i32 0, i32 0, i32 5))] }, align 8 +// CHECK: @_ZTC8Derived48_12VirtualBase2 = linkonce_odr unnamed_addr constant { [5 x ptr], [4 x ptr] } { [5 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr null, ptr ptrauth (ptr @_ZN5Base21bEv, i32 0, i64 [[BASE2_B_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTC8Derived48_12VirtualBase2, i32 0, i32 0, i32 3)), ptr ptrauth (ptr @_ZN12VirtualBase21gEv, i32 0, i64 [[VIRTUALBASE2_G_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTC8Derived48_12VirtualBase2, i32 0, i32 0, i32 4))], [4 x ptr] [ptr null, ptr inttoptr (i64 8 to ptr), ptr null, ptr ptrauth (ptr @_ZN5Base11aEv, i32 0, i64 [[BASE1_A_DISC]], ptr getelementptr inbounds ({ [5 x ptr], [4 x ptr] }, ptr @_ZTC8Derived48_12VirtualBase2, i32 0, i32 1, i32 3))] }, align 8 + +struct Base1 { virtual void a() {} }; +struct Base2 { virtual void b() {} }; +struct Derived1 : public Base1, public Base2 { + virtual void c() {} + virtual void d() {} +}; +struct Derived2 : public Base2, public Base1 { + virtual void c() {} + virtual void e() {} +}; + +struct Derived3 : public Base1, public Base2 { + constexpr Derived3(){} + virtual void i() {} +}; + +Base1 g_b1; +Base2 g_b2; +Derived1 g_d1; +Derived2 g_d2; +Derived3 
g_d3; + +extern "C" void test_basic_inheritance() { + Base1 g_b1; + Base2 g_b2; + Derived1 g_d1; + Derived2 g_d2; + Derived3 g_d3; +} + +struct VirtualBase1 : virtual Base1 { + VirtualBase1(){} + virtual void f() {} +}; +struct VirtualBase2 : virtual Base1, Base2 { + VirtualBase2(){} + virtual void g() {} +}; +struct Derived4 : VirtualBase1, VirtualBase2 { + virtual void h() {} +}; +struct Derived5 : VirtualBase2, VirtualBase1 { + virtual void h() {} +}; + +// CHECK-LABEL: define {{.*}} ptr @_ZN12VirtualBase1C1Ev +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]]) + +// CHECK-LABEL: define {{.*}} ptr @_ZN12VirtualBase2C1Ev +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]]) + +// CHECK-LABEL: define {{.*}} ptr @_ZN8Derived4C1Ev +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]]) + +// CHECK-LABEL: define {{.*}} ptr @_ZN8Derived5C1Ev +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]]) + + +VirtualBase1 g_vb1; +VirtualBase2 g_vb2; +Derived4 g_d4; +Derived5 g_d5; + + +extern "C" void cross_check_vtables(Base1 *b1, + Base2 *b2, + Derived1 *d1, + Derived2 *d2, + Derived3 *d3, + VirtualBase1 *vb1, + VirtualBase2 *vb2, + Derived4 *d4, + Derived4 *d5) { + asm("; b1->a()" ::: "memory"); + b1->a(); + asm("; b2->b()" ::: "memory"); + b2->b(); + asm("; d1->a()" ::: "memory"); + d1->a(); + asm("; d1->c()" ::: "memory"); + d1->c(); + asm("; d2->a()" ::: "memory"); + d2->a(); + asm("; d2->c()" ::: 
"memory"); + d2->c(); + asm("; d3->a()" ::: "memory"); + d3->a(); + asm("; d3->b()" ::: "memory"); + d3->b(); + asm("; d3->i()" ::: "memory"); + d3->i(); + asm("; vb1->a()" ::: "memory"); + vb1->a(); + asm("; vb1->f()" ::: "memory"); + vb1->f(); + asm("; vb2->a()" ::: "memory"); + vb2->a(); + asm("; vb2->g()" ::: "memory"); + vb2->g(); + asm("; d4->a()" ::: "memory"); + d4->a(); + asm("; d4->b()" ::: "memory"); + d4->b(); + asm("; d4->f()" ::: "memory"); + d4->f(); + asm("; d4->g()" ::: "memory"); + d4->g(); + asm("; d4->h()" ::: "memory"); + d4->h(); + asm("; d5->a()" ::: "memory"); + d5->a(); + asm("; d5->b()" ::: "memory"); + d5->b(); + asm("; d5->f()" ::: "memory"); + d5->f(); + asm("; d5->g()" ::: "memory"); + d5->g(); + asm("; d5->h()" ::: "memory"); + d5->h(); +} + +// CHECK-LABEL: define void @cross_check_vtables( +// CHECK: "; b1->a()", +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_A_DISC]]) +// CHECK: "; b2->b()" +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_B_DISC]]) +// CHECK: "; d1->a()" +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_A_DISC]]) +// CHECK: "; d1->c()" +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[DERIVED1_C_DISC]]) +// CHECK: "; d2->a()" +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_A_DISC]]) +// CHECK: "; d2->c()" +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[DERIVED2_C_DISC]]) +// CHECK: "; d3->a()" +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 
[[BASE1_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_A_DISC]]) +// CHECK: "; d3->b()" +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_B_DISC]]) +// CHECK: "; d3->i()" +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[DERIVED3_I_DISC]]) +// CHECK: "; vb1->a()" +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_A_DISC]]) +// CHECK: "; vb1->f()" +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[VIRTUALBASE1_F_DISC]]) +// CHECK: "; vb2->a()" +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_A_DISC]]) +// CHECK: "; vb2->g()" +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[VIRTUALBASE2_G_DISC]]) +// CHECK: "; d4->a()" +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_A_DISC]]) +// CHECK: "; d4->b()" +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_B_DISC]]) +// CHECK: "; d4->f()" +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[VIRTUALBASE1_F_DISC]]) +// CHECK: "; d4->g()" +// 
CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[VIRTUALBASE2_G_DISC]]) +// CHECK: "; d4->h()" +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[DERIVED4_H_DISC]]) + +// CHECK-LABEL: define {{.*}} ptr @_ZN5Base1C2Ev +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]]) + +// CHECK-LABEL: define {{.*}} ptr @_ZN5Base2C2Ev +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]]) + +// CHECK-LABEL: define {{.*}} ptr @_ZN8Derived1C2Ev +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]]) + +// CHECK-LABEL: define {{.*}} ptr @_ZN8Derived2C2Ev +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]]) + +// CHECK-LABEL: define {{.*}} ptr @_ZN8Derived3C2Ev +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE1_VTABLE_DISC]]) +// CHECK: call i64 @llvm.ptrauth.blend(i64 {{%.*}}, i64 [[BASE2_VTABLE_DISC]]) + diff --git a/clang/test/Driver/fpatchable-function-entry.c b/clang/test/Driver/fpatchable-function-entry.c index ab04fd39ffa1c..5f07ca99a69de 100644 --- a/clang/test/Driver/fpatchable-function-entry.c +++ b/clang/test/Driver/fpatchable-function-entry.c @@ -6,6 +6,8 @@ // RUN: %clang --target=loongarch64 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s // RUN: %clang --target=riscv32 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s // RUN: %clang --target=riscv64 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s +// RUN: %clang --target=powerpc-unknown-linux-gnu %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s +// RUN: %clang --target=powerpc64-unknown-linux-gnu %s 
-fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s // CHECK: "-fpatchable-function-entry=1" // RUN: %clang --target=aarch64 -fsyntax-only %s -fpatchable-function-entry=1,1 -c -### 2>&1 | FileCheck --check-prefix=11 %s @@ -13,8 +15,11 @@ // RUN: %clang --target=aarch64 -fsyntax-only %s -fpatchable-function-entry=2,1 -c -### 2>&1 | FileCheck --check-prefix=21 %s // 21: "-fpatchable-function-entry=2" "-fpatchable-function-entry-offset=1" -// RUN: not %clang --target=ppc64 -fsyntax-only %s -fpatchable-function-entry=1 2>&1 | FileCheck --check-prefix=TARGET %s -// TARGET: error: unsupported option '-fpatchable-function-entry=1' for target 'ppc64' +// RUN: not %clang --target=powerpc64-ibm-aix-xcoff -fsyntax-only %s -fpatchable-function-entry=1 2>&1 | FileCheck --check-prefix=AIX64 %s +// AIX64: error: unsupported option '-fpatchable-function-entry=1' for target 'powerpc64-ibm-aix-xcoff' + +// RUN: not %clang --target=powerpc-ibm-aix-xcoff -fsyntax-only %s -fpatchable-function-entry=1 2>&1 | FileCheck --check-prefix=AIX32 %s +// AIX32: error: unsupported option '-fpatchable-function-entry=1' for target 'powerpc-ibm-aix-xcoff' // RUN: not %clang --target=x86_64 -fsyntax-only %s -fpatchable-function-entry=1,0, 2>&1 | FileCheck --check-prefix=EXCESS %s // EXCESS: error: invalid argument '1,0,' to -fpatchable-function-entry= diff --git a/clang/test/Sema/patchable-function-entry-attr.cpp b/clang/test/Sema/patchable-function-entry-attr.cpp index 9134c851da588..bd4d57a7e3093 100644 --- a/clang/test/Sema/patchable-function-entry-attr.cpp +++ b/clang/test/Sema/patchable-function-entry-attr.cpp @@ -6,9 +6,14 @@ // RUN: %clang_cc1 -triple loongarch64 -fsyntax-only -verify=silence %s // RUN: %clang_cc1 -triple riscv32 -fsyntax-only -verify=silence %s // RUN: %clang_cc1 -triple riscv64 -fsyntax-only -verify=silence %s +// RUN: %clang_cc1 -triple powerpc-unknown-linux-gnu -fsyntax-only -verify=silence %s +// RUN: %clang_cc1 -triple powerpc64-unknown-linux-gnu -fsyntax-only 
-verify=silence %s // RUN: %clang_cc1 -triple ppc64le -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple powerpc64-ibm-aix-xcoff -fsyntax-only -verify=AIX %s +// RUN: %clang_cc1 -triple powerpc-ibm-aix-xcoff -fsyntax-only -verify=AIX %s // silence-no-diagnostics +// AIX-error@+2 {{'patchable_function_entry' attribute is not yet supported on AIX}} // expected-warning@+1 {{unknown attribute 'patchable_function_entry' ignored}} [[gnu::patchable_function_entry(0)]] void f(); diff --git a/clang/test/SemaCXX/builtin_vectorelements.cpp b/clang/test/SemaCXX/builtin_vectorelements.cpp index 59ff09ac72e42..b23675ea0ac6a 100644 --- a/clang/test/SemaCXX/builtin_vectorelements.cpp +++ b/clang/test/SemaCXX/builtin_vectorelements.cpp @@ -1,7 +1,9 @@ // RUN: %clang_cc1 -triple x86_64 -std=c++20 -fsyntax-only -verify -disable-llvm-passes %s +// RUN: %clang_cc1 -triple x86_64 -std=c++20 -fsyntax-only -verify -disable-llvm-passes -fexperimental-new-constant-interpreter %s // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple aarch64 -target-feature +sve -std=c++20 -fsyntax-only -verify -disable-llvm-passes %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -std=c++20 -fsyntax-only -verify -disable-llvm-passes -fexperimental-new-constant-interpreter %s template using VecT __attribute__((vector_size(16))) = T; diff --git a/clang/test/SemaCXX/modules.cppm b/clang/test/SemaCXX/modules.cppm index 267417bf5da2c..41204be76eafa 100644 --- a/clang/test/SemaCXX/modules.cppm +++ b/clang/test/SemaCXX/modules.cppm @@ -1,17 +1,19 @@ -// RUN: rm -rf %t -// RUN: mkdir -p %t -// RUN: split-file %s %t +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %s -o %t.0.pcm -verify -DTEST=0 +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %s -o %t.1.pcm -verify -DTEST=1 +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %s -fmodule-file=foo=%t.0.pcm -o %t.2.pcm -verify -DTEST=2 +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %s -fmodule-file=foo=%t.0.pcm -o %t.3.pcm 
-verify -Dfoo=bar -DTEST=3 -// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/A.cppm -o %t.0.pcm -verify -// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/B.cppm -o %t.1.pcm -verify -// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/C.cppm -fmodule-file=foo=%t.0.pcm -o %t.2.pcm -verify -// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/D.cppm -fmodule-file=foo=%t.0.pcm -o %t.3.pcm -verify -// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/E.cppm -fmodule-file=foo=%t.0.pcm -o %t.3.pcm -verify -Dfoo=bar +#if TEST == 0 || TEST == 2 +// expected-no-diagnostics +#endif -//--- A.cppm export module foo; + static int m; + int n; + +#if TEST == 0 export { int a; int b; @@ -25,43 +27,7 @@ export void f() {} export struct T { } t; -// expected-no-diagnostics - -//--- B.cppm -export module foo; -static int m; -int n; -struct S { - export int n; // expected-error {{expected member name or ';'}} - export static int n; // expected-error {{expected member name or ';'}} -}; - -// FIXME: Exports of declarations without external linkage are disallowed. -// Exports of declarations with non-external-linkage types are disallowed. - -// Cannot export within another export. This isn't precisely covered by the -// language rules right now, but (per personal correspondence between zygoloid -// and gdr) is the intent. 
-export { // expected-note {{export block begins here}} - extern "C++" { - namespace NestedExport { - export { // expected-error {{export declaration appears within another export declaration}} - int q; - } - } // namespace NestedExport - } -} - -//--- C.cppm -export module foo; -static int m; -int n; -// expected-no-diagnostics - -//--- D.cppm -export module foo; -static int m; -int n; +#elif TEST == 3 int use_a = a; // expected-error {{use of undeclared identifier 'a'}} #undef foo @@ -80,12 +46,29 @@ int use_n = n; // FIXME: this should not be visible, because it is not exported extern int n; static_assert(&n != p); // expected-error{{use of undeclared identifier 'p'}} +#endif -//--- E.cppm -export module foo; // expected-error {{the module name in a module declaration cannot contain an object-like macro 'foo'}} -static int m; -int n; -int use_a = a; // expected-error {{use of undeclared identifier 'a'}} +#if TEST == 1 +struct S { + export int n; // expected-error {{expected member name or ';'}} + export static int n; // expected-error {{expected member name or ';'}} +}; +#endif -#undef foo -import foo; // expected-error {{imports must immediately follow the module declaration}} +// FIXME: Exports of declarations without external linkage are disallowed. +// Exports of declarations with non-external-linkage types are disallowed. + +// Cannot export within another export. This isn't precisely covered by the +// language rules right now, but (per personal correspondence between zygoloid +// and gdr) is the intent. +#if TEST == 1 +export { // expected-note {{export block begins here}} + extern "C++" { + namespace NestedExport { + export { // expected-error {{export declaration appears within another export declaration}} + int q; + } + } // namespace NestedExport + } +} +#endif diff --git a/clang/www/cxx_status.html b/clang/www/cxx_status.html index 1f69a4e8a5620..a6ded8be3ae9e 100755 --- a/clang/www/cxx_status.html +++ b/clang/www/cxx_status.html @@ -182,7 +182,7 @@

C++2c implementation status

Module Declarations Shouldn’t be Macros P3034R1 (DR) - Clang 19 + No Trivial infinite loops are not Undefined Behavior diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc index 032b04a09ae76..49c9dcbef358f 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc @@ -6336,7 +6336,7 @@ INTERCEPTOR(void*, dlopen, const char *filename, int flag) { const char *SelfFName = DladdrSelfFName(); VPrintf(1, "dlopen interceptor: DladdrSelfFName: %p %s\n", - (void *)SelfFName, SelfFName); + (const void *)SelfFName, SelfFName); if (SelfFName && internal_strcmp(SelfFName, filename) == 0) { // It's possible they copied the string from dladdr, so diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp index 7d7c009f64442..ee3fd5b0a817a 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp @@ -47,7 +47,8 @@ void GetMemoryProfile(fill_profile_f cb, uptr *stats) { struct kinfo_proc2 *InfoProc; uptr Len = sizeof(*InfoProc); uptr Size = Len; - const int Mib[] = {CTL_KERN, KERN_PROC2, KERN_PROC_PID, getpid(), Size, 1}; + const int Mib[] = {CTL_KERN, KERN_PROC2, KERN_PROC_PID, + getpid(), (int)Size, 1}; InfoProc = (struct kinfo_proc2 *)MmapOrDie(Size, "GetMemoryProfile()"); CHECK_EQ( internal_sysctl(Mib, ARRAY_SIZE(Mib), nullptr, (uptr *)InfoProc, &Len, 0), diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_printer.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_printer.cpp index dddae441d5de9..5631d132da74a 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_printer.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_printer.cpp @@ -269,7 +269,7 @@ void 
FormattedStackTracePrinter::RenderFrame(InternalScopedString *buffer, break; default: Report("Unsupported specifier in stack frame format: %c (%p)!\n", *p, - (void *)p); + (const void *)p); Die(); } } @@ -323,7 +323,7 @@ void FormattedStackTracePrinter::RenderData(InternalScopedString *buffer, break; default: Report("Unsupported specifier in stack frame format: %c (%p)!\n", *p, - (void *)p); + (const void *)p); Die(); } } diff --git a/cross-project-tests/lit.cfg.py b/cross-project-tests/lit.cfg.py index 774c4eaf4d976..619634578dfe6 100644 --- a/cross-project-tests/lit.cfg.py +++ b/cross-project-tests/lit.cfg.py @@ -84,7 +84,13 @@ def get_required_attr(config, attr_name): # use_clang() and use_lld() respectively, so set them to "", if needed. if not hasattr(config, "clang_src_dir"): config.clang_src_dir = "" -llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects)) +# Facebook T92898286 +should_test_bolt = get_required_attr(config, "llvm_test_bolt") +if should_test_bolt: + llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects), additional_flags=["--post-link-optimize"]) +else: + llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects)) +# End Facebook T92898286 if not hasattr(config, "lld_src_dir"): config.lld_src_dir = "" @@ -293,3 +299,9 @@ def get_clang_default_dwarf_version_string(triple): # Allow 'REQUIRES: XXX-registered-target' in tests. for arch in config.targets_to_build: config.available_features.add(arch.lower() + "-registered-target") + +# Facebook T92898286 +# Ensure the user's PYTHONPATH is included. 
+if "PYTHONPATH" in os.environ: + config.environment["PYTHONPATH"] = os.environ["PYTHONPATH"] +# End Facebook T92898286 diff --git a/cross-project-tests/lit.site.cfg.py.in b/cross-project-tests/lit.site.cfg.py.in index 39458dfc79afd..2d53cd377f033 100644 --- a/cross-project-tests/lit.site.cfg.py.in +++ b/cross-project-tests/lit.site.cfg.py.in @@ -21,6 +21,10 @@ config.mlir_src_root = "@MLIR_SOURCE_DIR@" config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" +# Facebook T92898286 +config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@") +# End Facebook T92898286 + import lit.llvm lit.llvm.initialize(lit_config, config) diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp index 59de18c20417d..ac139cc21b0b6 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp @@ -151,10 +151,7 @@ void Benchmark::run_benchmarks() { all_results.reset(); gpu::sync_threads(); - if (!b->flags || - ((b->flags & BenchmarkFlags::SINGLE_THREADED) && id == 0) || - ((b->flags & BenchmarkFlags::SINGLE_WAVE) && - id < gpu::get_lane_size())) { + if (b->num_threads == static_cast(-1) || id < b->num_threads) { auto current_result = b->run(); all_results.update(current_result); } diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h index c07fab9ccfbe3..f5cf4822f6fd3 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.h +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h @@ -74,8 +74,6 @@ struct BenchmarkResult { clock_t total_time = 0; }; -enum BenchmarkFlags { SINGLE_THREADED = 0x1, SINGLE_WAVE = 0x2 }; - BenchmarkResult benchmark(const BenchmarkOptions &options, cpp::function wrapper_func); @@ -83,12 +81,13 @@ class Benchmark { const cpp::function func; const cpp::string_view suite_name; const cpp::string_view test_name; - const uint8_t flags; + const uint32_t num_threads; public: Benchmark(cpp::function func, char const *suite_name, - char const *test_name, uint8_t 
flags) - : func(func), suite_name(suite_name), test_name(test_name), flags(flags) { + char const *test_name, uint32_t num_threads) + : func(func), suite_name(suite_name), test_name(test_name), + num_threads(num_threads) { add_benchmark(this); } @@ -108,18 +107,21 @@ class Benchmark { } // namespace benchmarks } // namespace LIBC_NAMESPACE_DECL +// Passing -1 indicates the benchmark should be run with as many threads as +// allocated by the user in the benchmark's CMake. #define BENCHMARK(SuiteName, TestName, Func) \ LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \ - Func, #SuiteName, #TestName, 0) + Func, #SuiteName, #TestName, -1) -#define SINGLE_THREADED_BENCHMARK(SuiteName, TestName, Func) \ +#define BENCHMARK_N_THREADS(SuiteName, TestName, Func, NumThreads) \ LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \ - Func, #SuiteName, #TestName, \ - LIBC_NAMESPACE::benchmarks::BenchmarkFlags::SINGLE_THREADED) + Func, #SuiteName, #TestName, NumThreads) + +#define SINGLE_THREADED_BENCHMARK(SuiteName, TestName, Func) \ + BENCHMARK_N_THREADS(SuiteName, TestName, Func, 1) #define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func) \ - LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \ - Func, #SuiteName, #TestName, \ - LIBC_NAMESPACE::benchmarks::BenchmarkFlags::SINGLE_WAVE) + BENCHMARK_N_THREADS(SuiteName, TestName, Func, \ + LIBC_NAMESPACE::gpu::get_lane_size()) #endif diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index ea3f36604e45d..6c557b7a69cb6 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -266,6 +266,10 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.sys.stat.mkdirat libc.src.sys.stat.stat + # sys/statvfs.h + libc.src.sys.statvfs.fstatvfs + libc.src.sys.statvfs.statvfs + # sys/utsname.h entrypoints libc.src.sys.utsname.uname @@ -377,6 +381,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.cosf 
libc.src.math.coshf libc.src.math.cospif + libc.src.math.dmull libc.src.math.dsqrtl libc.src.math.erff libc.src.math.exp @@ -434,6 +439,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.fmodf libc.src.math.fmodl libc.src.math.fmul + libc.src.math.fmull libc.src.math.frexp libc.src.math.frexpf libc.src.math.frexpl diff --git a/libc/config/linux/riscv/headers.txt b/libc/config/linux/riscv/headers.txt index da203e9850603..4bb8d23ab961a 100644 --- a/libc/config/linux/riscv/headers.txt +++ b/libc/config/linux/riscv/headers.txt @@ -44,6 +44,7 @@ set(TARGET_PUBLIC_HEADERS libc.include.sys_select libc.include.sys_socket libc.include.sys_stat + libc.include.sys_statvfs libc.include.sys_syscall libc.include.sys_time libc.include.sys_types diff --git a/libc/src/__support/FPUtil/BasicOperations.h b/libc/src/__support/FPUtil/BasicOperations.h index a963a92bfb074..3b7d7a5c249ae 100644 --- a/libc/src/__support/FPUtil/BasicOperations.h +++ b/libc/src/__support/FPUtil/BasicOperations.h @@ -11,8 +11,8 @@ #include "FEnvImpl.h" #include "FPBits.h" +#include "dyadic_float.h" -#include "FEnvImpl.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" @@ -269,12 +269,21 @@ totalordermag(T x, T y) { template LIBC_INLINE cpp::enable_if_t, T> getpayload(T x) { using FPBits = FPBits; + using StorageType = typename FPBits::StorageType; FPBits x_bits(x); if (!x_bits.is_nan()) return T(-1.0); - return T(x_bits.uintval() & (FPBits::FRACTION_MASK >> 1)); + StorageType payload = x_bits.uintval() & (FPBits::FRACTION_MASK >> 1); + + if constexpr (is_big_int_v) { + DyadicFloat payload_dfloat(Sign::POS, 0, payload); + + return static_cast(payload_dfloat); + } else { + return static_cast(payload); + } } template diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt index 793d3a121c742..8804f3a4d5e23 100644 --- a/libc/src/__support/FPUtil/CMakeLists.txt +++ b/libc/src/__support/FPUtil/CMakeLists.txt 
@@ -75,19 +75,6 @@ add_header_library( libc.src.__support.common ) -add_header_library( - basic_operations - HDRS - BasicOperations.h - DEPENDS - .fp_bits - .fenv_impl - libc.src.__support.CPP.type_traits - libc.src.__support.uint128 - libc.src.__support.common - libc.src.__support.macros.optimization -) - add_header_library( division_and_remainder_operations HDRS @@ -113,21 +100,6 @@ add_header_library( ) -add_header_library( - hypot - HDRS - Hypot.h - DEPENDS - .basic_operations - .fenv_impl - .fp_bits - .rounding_mode - libc.src.__support.common - libc.src.__support.CPP.bit - libc.src.__support.CPP.type_traits - libc.src.__support.uint128 -) - add_header_library( sqrt HDRS @@ -208,6 +180,35 @@ add_header_library( libc.src.__support.macros.optimization ) +add_header_library( + basic_operations + HDRS + BasicOperations.h + DEPENDS + .dyadic_float + .fp_bits + .fenv_impl + libc.src.__support.CPP.type_traits + libc.src.__support.uint128 + libc.src.__support.common + libc.src.__support.macros.optimization +) + +add_header_library( + hypot + HDRS + Hypot.h + DEPENDS + .basic_operations + .fenv_impl + .fp_bits + .rounding_mode + libc.src.__support.common + libc.src.__support.CPP.bit + libc.src.__support.CPP.type_traits + libc.src.__support.uint128 +) + add_header_library( manipulation_functions HDRS diff --git a/libc/src/__support/FPUtil/generic/FMA.h b/libc/src/__support/FPUtil/generic/FMA.h index 337301d86d919..e5683c8ff61ea 100644 --- a/libc/src/__support/FPUtil/generic/FMA.h +++ b/libc/src/__support/FPUtil/generic/FMA.h @@ -129,27 +129,27 @@ fma(InType x, InType y, InType z) { raise_except_if_required(FE_INVALID); if (x_bits.is_quiet_nan()) { - InStorageType x_payload = static_cast(getpayload(x)); - if ((x_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0) - return OutFPBits::quiet_nan(x_bits.sign(), - static_cast(x_payload)) - .get_val(); + InStorageType x_payload = x_bits.get_mantissa(); + x_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN; + return 
OutFPBits::quiet_nan(x_bits.sign(), + static_cast(x_payload)) + .get_val(); } if (y_bits.is_quiet_nan()) { - InStorageType y_payload = static_cast(getpayload(y)); - if ((y_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0) - return OutFPBits::quiet_nan(y_bits.sign(), - static_cast(y_payload)) - .get_val(); + InStorageType y_payload = y_bits.get_mantissa(); + y_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN; + return OutFPBits::quiet_nan(y_bits.sign(), + static_cast(y_payload)) + .get_val(); } if (z_bits.is_quiet_nan()) { - InStorageType z_payload = static_cast(getpayload(z)); - if ((z_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0) - return OutFPBits::quiet_nan(z_bits.sign(), - static_cast(z_payload)) - .get_val(); + InStorageType z_payload = z_bits.get_mantissa(); + z_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN; + return OutFPBits::quiet_nan(z_bits.sign(), + static_cast(z_payload)) + .get_val(); } return OutFPBits::quiet_nan().get_val(); @@ -163,18 +163,27 @@ fma(InType x, InType y, InType z) { int y_exp = 0; int z_exp = 0; + // Denormal scaling = 2^(fraction length). + constexpr InStorageType IMPLICIT_MASK = + InFPBits::SIG_MASK - InFPBits::FRACTION_MASK; + + constexpr InType DENORMAL_SCALING = + InFPBits::create_value( + Sign::POS, InFPBits::FRACTION_LEN + InFPBits::EXP_BIAS, IMPLICIT_MASK) + .get_val(); + // Normalize denormal inputs. 
if (LIBC_UNLIKELY(InFPBits(x).is_subnormal())) { x_exp -= InFPBits::FRACTION_LEN; - x *= InType(InStorageType(1) << InFPBits::FRACTION_LEN); + x *= DENORMAL_SCALING; } if (LIBC_UNLIKELY(InFPBits(y).is_subnormal())) { y_exp -= InFPBits::FRACTION_LEN; - y *= InType(InStorageType(1) << InFPBits::FRACTION_LEN); + y *= DENORMAL_SCALING; } if (LIBC_UNLIKELY(InFPBits(z).is_subnormal())) { z_exp -= InFPBits::FRACTION_LEN; - z *= InType(InStorageType(1) << InFPBits::FRACTION_LEN); + z *= DENORMAL_SCALING; } x_bits = InFPBits(x); diff --git a/libc/src/__support/FPUtil/generic/add_sub.h b/libc/src/__support/FPUtil/generic/add_sub.h index ec20a8723b704..850db3f83209e 100644 --- a/libc/src/__support/FPUtil/generic/add_sub.h +++ b/libc/src/__support/FPUtil/generic/add_sub.h @@ -56,19 +56,19 @@ add_or_sub(InType x, InType y) { raise_except_if_required(FE_INVALID); if (x_bits.is_quiet_nan()) { - InStorageType x_payload = static_cast(getpayload(x)); - if ((x_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0) - return OutFPBits::quiet_nan(x_bits.sign(), - static_cast(x_payload)) - .get_val(); + InStorageType x_payload = x_bits.get_mantissa(); + x_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN; + return OutFPBits::quiet_nan(x_bits.sign(), + static_cast(x_payload)) + .get_val(); } if (y_bits.is_quiet_nan()) { - InStorageType y_payload = static_cast(getpayload(y)); - if ((y_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0) - return OutFPBits::quiet_nan(y_bits.sign(), - static_cast(y_payload)) - .get_val(); + InStorageType y_payload = y_bits.get_mantissa(); + y_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN; + return OutFPBits::quiet_nan(y_bits.sign(), + static_cast(y_payload)) + .get_val(); } return OutFPBits::quiet_nan().get_val(); @@ -174,10 +174,12 @@ add_or_sub(InType x, InType y) { else aligned_min_mant_sticky = true; + InStorageType min_mant_sticky(static_cast(aligned_min_mant_sticky)); + if (is_effectively_add) - result_mant = max_mant + 
(aligned_min_mant | aligned_min_mant_sticky); + result_mant = max_mant + (aligned_min_mant | min_mant_sticky); else - result_mant = max_mant - (aligned_min_mant | aligned_min_mant_sticky); + result_mant = max_mant - (aligned_min_mant | min_mant_sticky); } int result_exp = max_bits.get_exponent() - RESULT_FRACTION_LEN; diff --git a/libc/src/__support/FPUtil/generic/div.h b/libc/src/__support/FPUtil/generic/div.h index 27545786aea17..dad1772fce750 100644 --- a/libc/src/__support/FPUtil/generic/div.h +++ b/libc/src/__support/FPUtil/generic/div.h @@ -49,19 +49,19 @@ div(InType x, InType y) { raise_except_if_required(FE_INVALID); if (x_bits.is_quiet_nan()) { - InStorageType x_payload = static_cast(getpayload(x)); - if ((x_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0) - return OutFPBits::quiet_nan(x_bits.sign(), - static_cast(x_payload)) - .get_val(); + InStorageType x_payload = x_bits.get_mantissa(); + x_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN; + return OutFPBits::quiet_nan(x_bits.sign(), + static_cast(x_payload)) + .get_val(); } if (y_bits.is_quiet_nan()) { - InStorageType y_payload = static_cast(getpayload(y)); - if ((y_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0) - return OutFPBits::quiet_nan(y_bits.sign(), - static_cast(y_payload)) - .get_val(); + InStorageType y_payload = y_bits.get_mantissa(); + y_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN; + return OutFPBits::quiet_nan(y_bits.sign(), + static_cast(y_payload)) + .get_val(); } return OutFPBits::quiet_nan().get_val(); diff --git a/libc/src/__support/FPUtil/generic/mul.h b/libc/src/__support/FPUtil/generic/mul.h index 02fc69c6cb1ba..20d9a77792762 100644 --- a/libc/src/__support/FPUtil/generic/mul.h +++ b/libc/src/__support/FPUtil/generic/mul.h @@ -50,19 +50,19 @@ mul(InType x, InType y) { raise_except_if_required(FE_INVALID); if (x_bits.is_quiet_nan()) { - InStorageType x_payload = static_cast(getpayload(x)); - if ((x_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0) - 
return OutFPBits::quiet_nan(x_bits.sign(), - static_cast(x_payload)) - .get_val(); + InStorageType x_payload = x_bits.get_mantissa(); + x_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN; + return OutFPBits::quiet_nan(x_bits.sign(), + static_cast(x_payload)) + .get_val(); } if (y_bits.is_quiet_nan()) { - InStorageType y_payload = static_cast(getpayload(y)); - if ((y_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0) - return OutFPBits::quiet_nan(y_bits.sign(), - static_cast(y_payload)) - .get_val(); + InStorageType y_payload = y_bits.get_mantissa(); + y_payload >>= InFPBits::FRACTION_LEN - OutFPBits::FRACTION_LEN; + return OutFPBits::quiet_nan(y_bits.sign(), + static_cast(y_payload)) + .get_val(); } return OutFPBits::quiet_nan().get_val(); diff --git a/libc/test/src/math/smoke/AddTest.h b/libc/test/src/math/smoke/AddTest.h index c713c5a88763c..0b7e395a22d4c 100644 --- a/libc/test/src/math/smoke/AddTest.h +++ b/libc/test/src/math/smoke/AddTest.h @@ -38,23 +38,8 @@ class AddTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(sNaN, sNaN), FE_INVALID); InType qnan_42 = InFPBits::quiet_nan(Sign::POS, 0x42).get_val(); - EXPECT_FP_EQ(InType(0x42.0p+0), - LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, zero))); - EXPECT_FP_EQ(InType(0x42.0p+0), - LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_42))); - - if constexpr (sizeof(OutType) < sizeof(InType)) { - InStorageType max_payload = InFPBits::FRACTION_MASK >> 1; - InType qnan_max = InFPBits::quiet_nan(Sign::POS, max_payload).get_val(); - EXPECT_FP_EQ(zero, - LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, zero))); - EXPECT_FP_EQ(zero, - LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_max))); - EXPECT_FP_EQ(InType(0x42.0p+0), - LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, qnan_42))); - EXPECT_FP_EQ(InType(0x42.0p+0), - LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, qnan_max))); - } + EXPECT_FP_IS_NAN(func(qnan_42, zero)); + EXPECT_FP_IS_NAN(func(zero, 
qnan_42)); EXPECT_FP_EQ(inf, func(inf, zero)); EXPECT_FP_EQ(neg_inf, func(neg_inf, zero)); diff --git a/libc/test/src/math/smoke/DivTest.h b/libc/test/src/math/smoke/DivTest.h index 71cfb326b97cc..b30fc17aac1d5 100644 --- a/libc/test/src/math/smoke/DivTest.h +++ b/libc/test/src/math/smoke/DivTest.h @@ -36,23 +36,8 @@ class DivTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(sNaN, sNaN), FE_INVALID); InType qnan_42 = InFPBits::quiet_nan(Sign::POS, 0x42).get_val(); - EXPECT_FP_EQ(InType(0x42.0p+0), - LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, zero))); - EXPECT_FP_EQ(InType(0x42.0p+0), - LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_42))); - - if constexpr (sizeof(OutType) < sizeof(InType)) { - InStorageType max_payload = InFPBits::FRACTION_MASK >> 1; - InType qnan_max = InFPBits::quiet_nan(Sign::POS, max_payload).get_val(); - EXPECT_FP_EQ(zero, - LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, zero))); - EXPECT_FP_EQ(zero, - LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_max))); - EXPECT_FP_EQ(InType(0x42.0p+0), - LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, qnan_42))); - EXPECT_FP_EQ(InType(0x42.0p+0), - LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, qnan_max))); - } + EXPECT_FP_IS_NAN(func(qnan_42, zero)); + EXPECT_FP_IS_NAN(func(zero, qnan_42)); EXPECT_FP_EQ(inf, func(inf, zero)); EXPECT_FP_EQ(neg_inf, func(neg_inf, zero)); diff --git a/libc/test/src/math/smoke/MulTest.h b/libc/test/src/math/smoke/MulTest.h index e2298eaeeb216..0c847e39687b7 100644 --- a/libc/test/src/math/smoke/MulTest.h +++ b/libc/test/src/math/smoke/MulTest.h @@ -38,23 +38,8 @@ class MulTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(sNaN, sNaN), FE_INVALID); InType qnan_42 = InFPBits::quiet_nan(Sign::POS, 0x42).get_val(); - EXPECT_FP_EQ(InType(0x42.0p+0), - LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, zero))); - EXPECT_FP_EQ(InType(0x42.0p+0), - 
LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_42))); - - if constexpr (sizeof(OutType) < sizeof(InType)) { - InStorageType max_payload = InFPBits::FRACTION_MASK >> 1; - InType qnan_max = InFPBits::quiet_nan(Sign::POS, max_payload).get_val(); - EXPECT_FP_EQ(zero, - LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, zero))); - EXPECT_FP_EQ(zero, - LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_max))); - EXPECT_FP_EQ(InType(0x42.0p+0), - LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, qnan_42))); - EXPECT_FP_EQ(InType(0x42.0p+0), - LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, qnan_max))); - } + EXPECT_FP_IS_NAN(func(qnan_42, zero)); + EXPECT_FP_IS_NAN(func(zero, qnan_42)); EXPECT_FP_EQ(inf, func(inf, InType(1.0))); EXPECT_FP_EQ(neg_inf, func(neg_inf, InType(1.0))); diff --git a/libc/test/src/math/smoke/SubTest.h b/libc/test/src/math/smoke/SubTest.h index a4b3822aca43c..e5e04996affa8 100644 --- a/libc/test/src/math/smoke/SubTest.h +++ b/libc/test/src/math/smoke/SubTest.h @@ -37,23 +37,8 @@ class SubTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(sNaN, sNaN), FE_INVALID); InType qnan_42 = InFPBits::quiet_nan(Sign::POS, 0x42).get_val(); - EXPECT_FP_EQ(InType(0x42.0p+0), - LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, zero))); - EXPECT_FP_EQ(InType(0x42.0p+0), - LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_42))); - - if constexpr (sizeof(OutType) < sizeof(InType)) { - InStorageType max_payload = InFPBits::FRACTION_MASK >> 1; - InType qnan_max = InFPBits::quiet_nan(Sign::POS, max_payload).get_val(); - EXPECT_FP_EQ(zero, - LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, zero))); - EXPECT_FP_EQ(zero, - LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_max))); - EXPECT_FP_EQ(InType(0x42.0p+0), - LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, qnan_42))); - EXPECT_FP_EQ(InType(0x42.0p+0), - LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, qnan_max))); - } + EXPECT_FP_IS_NAN(func(qnan_42, zero)); + 
EXPECT_FP_IS_NAN(func(zero, qnan_42)); EXPECT_FP_EQ(inf, func(inf, zero)); EXPECT_FP_EQ(neg_inf, func(neg_inf, zero)); diff --git a/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp b/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp index 0895c33167151..8cb5f867453e4 100644 --- a/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp +++ b/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp @@ -9,10 +9,16 @@ #include using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher; +#ifdef SYS_statfs64 +using StatFs = statfs64; +#else +using StatFs = statfs; +#endif + namespace LIBC_NAMESPACE_DECL { -static int fstatfs(int fd, struct statfs *buf) { +static int fstatfs(int fd, StatFs *buf) { using namespace statfs_utils; - if (cpp::optional result = linux_fstatfs(fd)) { + if (cpp::optional result = linux_fstatfs(fd)) { *buf = *result; return 0; } @@ -29,7 +35,7 @@ struct PathFD { }; TEST(LlvmLibcSysStatvfsTest, FstatfsBasic) { - struct statfs buf; + StatFs buf; ASSERT_THAT(LIBC_NAMESPACE::fstatfs(PathFD("/"), &buf), Succeeds()); ASSERT_THAT(LIBC_NAMESPACE::fstatfs(PathFD("/proc"), &buf), Succeeds()); ASSERT_EQ(buf.f_type, static_cast(PROC_SUPER_MAGIC)); diff --git a/libc/test/src/sys/statvfs/linux/statvfs_test.cpp b/libc/test/src/sys/statvfs/linux/statvfs_test.cpp index 6719c1ab26865..5329adb54d64d 100644 --- a/libc/test/src/sys/statvfs/linux/statvfs_test.cpp +++ b/libc/test/src/sys/statvfs/linux/statvfs_test.cpp @@ -6,8 +6,14 @@ #include using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher; +#ifdef SYS_statfs64 +using StatFs = statfs64; +#else +using StatFs = statfs; +#endif + namespace LIBC_NAMESPACE_DECL { -static int statfs(const char *path, struct statfs *buf) { +static int statfs(const char *path, StatFs *buf) { using namespace statfs_utils; if (cpp::optional result = linux_statfs(path)) { *buf = *result; @@ -18,7 +24,7 @@ static int statfs(const char *path, struct statfs *buf) { } // namespace LIBC_NAMESPACE_DECL TEST(LlvmLibcSysStatfsTest, StatfsBasic) { - 
struct statfs buf; + StatFs buf; ASSERT_THAT(LIBC_NAMESPACE::statfs("/", &buf), Succeeds()); ASSERT_THAT(LIBC_NAMESPACE::statfs("/proc", &buf), Succeeds()); ASSERT_EQ(buf.f_type, static_cast(PROC_SUPER_MAGIC)); diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp index 6af89ce3517b7..56759c28dcf41 100644 --- a/lld/ELF/Arch/RISCV.cpp +++ b/lld/ELF/Arch/RISCV.cpp @@ -1131,6 +1131,7 @@ static void mergeAtomic(DenseMap::iterator it, case RISCVAttrs::RISCVAtomicAbiTag::A6C: return; }; + break; case RISCVAtomicAbiTag::A6S: switch (newTag) { @@ -1144,6 +1145,7 @@ static void mergeAtomic(DenseMap::iterator it, case RISCVAttrs::RISCVAtomicAbiTag::A6S: return; }; + break; case RISCVAtomicAbiTag::A7: switch (newTag) { @@ -1157,6 +1159,7 @@ static void mergeAtomic(DenseMap::iterator it, case RISCVAttrs::RISCVAtomicAbiTag::A7: return; }; + break; }; // If we get here, then we have an invalid tag, so report it. diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp index e6b80c1d42d9e..0eb809282af28 100644 --- a/lld/MachO/Writer.cpp +++ b/lld/MachO/Writer.cpp @@ -28,8 +28,8 @@ #include "llvm/Support/LEB128.h" #include "llvm/Support/Parallel.h" #include "llvm/Support/Path.h" -#include "llvm/Support/ThreadPool.h" #include "llvm/Support/TimeProfiler.h" +#include "llvm/Support/thread.h" #include "llvm/Support/xxhash.h" #include @@ -66,7 +66,6 @@ class Writer { template void run(); - DefaultThreadPool threadPool; std::unique_ptr &buffer; uint64_t addr = 0; uint64_t fileOff = 0; @@ -1121,14 +1120,12 @@ void Writer::finalizeLinkEditSegment() { symtabSection, indirectSymtabSection, dataInCodeSection, functionStartsSection, }; - SmallVector> threadFutures; - threadFutures.reserve(linkEditSections.size()); - for (LinkEditSection *osec : linkEditSections) - if (osec) - threadFutures.emplace_back(threadPool.async( - [](LinkEditSection *osec) { osec->finalizeContents(); }, osec)); - for (std::shared_future &future : threadFutures) - future.wait(); + + 
parallelForEach(linkEditSections.begin(), linkEditSections.end(), + [](LinkEditSection *osec) { + if (osec) + osec->finalizeContents(); + }); // Now that __LINKEDIT is filled out, do a proper calculation of its // addresses and offsets. @@ -1170,6 +1167,8 @@ void Writer::openFile() { } void Writer::writeSections() { + TimeTraceScope timeScope("Write output sections"); + uint8_t *buf = buffer->getBufferStart(); std::vector osecs; for (const OutputSegment *seg : outputSegments) @@ -1200,18 +1199,15 @@ void Writer::writeUuid() { ArrayRef data{buffer->getBufferStart(), buffer->getBufferEnd()}; std::vector> chunks = split(data, 1024 * 1024); + // Leave one slot for filename std::vector hashes(chunks.size() + 1); - SmallVector> threadFutures; - threadFutures.reserve(chunks.size()); - for (size_t i = 0; i < chunks.size(); ++i) - threadFutures.emplace_back(threadPool.async( - [&](size_t j) { hashes[j] = xxh3_64bits(chunks[j]); }, i)); - for (std::shared_future &future : threadFutures) - future.wait(); + parallelFor(0, chunks.size(), + [&](size_t i) { hashes[i] = xxh3_64bits(chunks[i]); }); // Append the output filename so that identical binaries with different names // don't get the same UUID. 
hashes[chunks.size()] = xxh3_64bits(sys::path::filename(config->finalOutput)); + uint64_t digest = xxh3_64bits({reinterpret_cast(hashes.data()), hashes.size() * sizeof(uint64_t)}); uuidCommand->writeUuid(digest); @@ -1330,15 +1326,18 @@ template void Writer::run() { sortSegmentsAndSections(); createLoadCommands(); finalizeAddresses(); - threadPool.async([&] { + + llvm::thread mapFileWriter([&] { if (LLVM_ENABLE_THREADS && config->timeTraceEnabled) timeTraceProfilerInitialize(config->timeTraceGranularity, "writeMapFile"); writeMapFile(); if (LLVM_ENABLE_THREADS && config->timeTraceEnabled) timeTraceProfilerFinishThread(); }); + finalizeLinkEditSegment(); writeOutputFile(); + mapFileWriter.join(); } template void macho::writeResult() { Writer().run(); } diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py index 96520c7c82624..dfeb76544e57d 100644 --- a/lldb/test/API/lit.cfg.py +++ b/lldb/test/API/lit.cfg.py @@ -265,6 +265,11 @@ def delete_module_cache(path): if is_configured("lldb_framework_dir"): dotest_cmd += ["--framework", config.lldb_framework_dir] +# Facebook T92898286 +if is_configured("llvm_test_bolt"): + dotest_cmd += ["-E", '"--post-link-optimize"'] +# End Facebook T92898286 + if ( "lldb-repro-capture" in config.available_features or "lldb-repro-replay" in config.available_features diff --git a/lldb/test/API/lit.site.cfg.py.in b/lldb/test/API/lit.site.cfg.py.in index 8b2d09ae41cd2..602f45759e48f 100644 --- a/lldb/test/API/lit.site.cfg.py.in +++ b/lldb/test/API/lit.site.cfg.py.in @@ -1,5 +1,9 @@ @LIT_SITE_CFG_IN_HEADER@ +#Facebook T92898286 +import lit.util +#End Facebook T92898286 + config.llvm_src_root = "@LLVM_SOURCE_DIR@" config.llvm_obj_root = "@LLVM_BINARY_DIR@" config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@") @@ -39,6 +43,10 @@ config.libcxx_include_target_dir = "@LIBCXX_GENERATED_INCLUDE_TARGET_DIR@" config.lldb_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_LLDB@", "lldb-api") config.clang_module_cache = 
os.path.join("@LLDB_TEST_MODULE_CACHE_CLANG@", "lldb-api") +# Facebook T92898286 +config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@") +# End Facebook T92898286 + # Plugins lldb_build_intel_pt = '@LLDB_BUILD_INTEL_PT@' if lldb_build_intel_pt == '1': diff --git a/lldb/test/Shell/helper/toolchain.py b/lldb/test/Shell/helper/toolchain.py index 255955fc70d8c..7b7be06643166 100644 --- a/lldb/test/Shell/helper/toolchain.py +++ b/lldb/test/Shell/helper/toolchain.py @@ -165,6 +165,11 @@ def use_support_substitutions(config): if config.cmake_sysroot: host_flags += ["--sysroot={}".format(config.cmake_sysroot)] + # Facebook T92898286 + if config.llvm_test_bolt: + host_flags += ["--post-link-optimize"] + # End Facebook T92898286 + host_flags = " ".join(host_flags) config.substitutions.append(("%clang_host", "%clang " + host_flags)) config.substitutions.append(("%clangxx_host", "%clangxx " + host_flags)) diff --git a/lldb/test/Shell/lit.site.cfg.py.in b/lldb/test/Shell/lit.site.cfg.py.in index b69e7bce1bc0b..fe8323734b7db 100644 --- a/lldb/test/Shell/lit.site.cfg.py.in +++ b/lldb/test/Shell/lit.site.cfg.py.in @@ -1,5 +1,10 @@ @LIT_SITE_CFG_IN_HEADER@ +#Facebook T92898286 +import lit.util +#End Facebook T92898286 + + config.llvm_src_root = "@LLVM_SOURCE_DIR@" config.llvm_obj_root = "@LLVM_BINARY_DIR@" config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@") @@ -31,6 +36,10 @@ config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" config.lldb_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_LLDB@", "lldb-shell") config.clang_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_CLANG@", "lldb-shell") +# Facebook T92898286 +config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@") +# End Facebook T92898286 + import lit.llvm lit.llvm.initialize(lit_config, config) diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 12618966c4adf..a08b477060f48 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -709,6 +709,10 @@ 
set(LLVM_LIB_FUZZING_ENGINE "" CACHE PATH option(LLVM_USE_SPLIT_DWARF "Use -gsplit-dwarf when compiling llvm and --gdb-index when linking." OFF) +# Facebook T92898286 +option(LLVM_TEST_BOLT "Enable BOLT testing in non-BOLT tests that use clang" OFF) +# End Facebook T92898286 + # Define an option controlling whether we should build for 32-bit on 64-bit # platforms, where supported. if( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT (WIN32 OR ${CMAKE_SYSTEM_NAME} MATCHES "AIX")) diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h index 2c2f965a3cd6f..44e27234aa90d 100644 --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -822,15 +822,25 @@ bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI = nullptr, AssumptionCache *AC = nullptr, const DominatorTree *DT = nullptr, - const TargetLibraryInfo *TLI = nullptr); + const TargetLibraryInfo *TLI = nullptr, + bool UseVariableInfo = true); + +inline bool isSafeToSpeculativelyExecute(const Instruction *I, + BasicBlock::iterator CtxI, + AssumptionCache *AC = nullptr, + const DominatorTree *DT = nullptr, + const TargetLibraryInfo *TLI = nullptr, + bool UseVariableInfo = true) { + // Take an iterator, and unwrap it into an Instruction *. + return isSafeToSpeculativelyExecute(I, &*CtxI, AC, DT, TLI, UseVariableInfo); +} +/// Don't use information from its non-constant operands. This helper is used +/// when its operands are going to be replaced. inline bool -isSafeToSpeculativelyExecute(const Instruction *I, BasicBlock::iterator CtxI, - AssumptionCache *AC = nullptr, - const DominatorTree *DT = nullptr, - const TargetLibraryInfo *TLI = nullptr) { - // Take an iterator, and unwrap it into an Instruction *. 
- return isSafeToSpeculativelyExecute(I, &*CtxI, AC, DT, TLI); +isSafeToSpeculativelyExecuteWithVariableReplaced(const Instruction *I) { + return isSafeToSpeculativelyExecute(I, nullptr, nullptr, nullptr, nullptr, + /*UseVariableInfo=*/false); } /// This returns the same result as isSafeToSpeculativelyExecute if Opcode is @@ -853,7 +863,7 @@ isSafeToSpeculativelyExecute(const Instruction *I, BasicBlock::iterator CtxI, bool isSafeToSpeculativelyExecuteWithOpcode( unsigned Opcode, const Instruction *Inst, const Instruction *CtxI = nullptr, AssumptionCache *AC = nullptr, const DominatorTree *DT = nullptr, - const TargetLibraryInfo *TLI = nullptr); + const TargetLibraryInfo *TLI = nullptr, bool UseVariableInfo = true); /// Returns true if the result or effects of the given instructions \p I /// depend values not reachable through the def use graph. diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 5c7f6ddc94840..649711d8faf65 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -2226,6 +2226,12 @@ class TargetInstrInfo : public MCInstrInfo { return OptLevel >= CodeGenOptLevel::Aggressive ? 4 : 2; } + /// Returns the target-specific default value for tail merging. + /// This value will be used if the tail-merge-size argument is not provided. + virtual unsigned getTailMergeSize(const MachineFunction &MF) const { + return 3; + } + /// Returns the callee operand from the given \p MI. 
virtual const MachineOperand &getCalleeOperand(const MachineInstr &MI) const { return MI.getOperand(0); diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h index b8b5f90b6b0fb..fa522ff88e1db 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h @@ -184,7 +184,7 @@ template class AllocGroupSmallMap { iterator end() { return Elems.end(); } iterator find(AllocGroup G) { auto I = lower_bound(Elems, G, compareKey); - return (I->first == G) ? I : end(); + return (I == end() || I->first == G) ? I : end(); } bool empty() const { return Elems.empty(); } diff --git a/llvm/include/llvm/Support/raw_socket_stream.h b/llvm/include/llvm/Support/raw_socket_stream.h index eed865fb5af49..6c65a66dec9a4 100644 --- a/llvm/include/llvm/Support/raw_socket_stream.h +++ b/llvm/include/llvm/Support/raw_socket_stream.h @@ -92,13 +92,14 @@ class ListeningSocket { /// Accepts an incoming connection on the listening socket. This method can /// optionally either block until a connection is available or timeout after a /// specified amount of time has passed. By default the method will block - /// until the socket has recieved a connection. + /// until the socket has recieved a connection. If the accept timesout this + /// method will return std::errc:timed_out /// /// \param Timeout An optional timeout duration in milliseconds. Setting - /// Timeout to -1 causes accept to block indefinitely + /// Timeout to a negative number causes ::accept to block indefinitely /// - Expected> - accept(std::chrono::milliseconds Timeout = std::chrono::milliseconds(-1)); + Expected> accept( + const std::chrono::milliseconds &Timeout = std::chrono::milliseconds(-1)); /// Creates a listening socket bound to the specified file system path. 
/// Handles the socket creation, binding, and immediately starts listening for @@ -124,11 +125,28 @@ class raw_socket_stream : public raw_fd_stream { public: raw_socket_stream(int SocketFD); + ~raw_socket_stream(); + /// Create a \p raw_socket_stream connected to the UNIX domain socket at \p /// SocketPath. static Expected> createConnectedUnix(StringRef SocketPath); - ~raw_socket_stream(); + + /// Attempt to read from the raw_socket_stream's file descriptor. + /// + /// This method can optionally either block until data is read or an error has + /// occurred or timeout after a specified amount of time has passed. By + /// default the method will block until the socket has read data or + /// encountered an error. If the read times out this method will return + /// std::errc:timed_out + /// + /// \param Ptr The start of the buffer that will hold any read data + /// \param Size The number of bytes to be read + /// \param Timeout An optional timeout duration in milliseconds + /// + ssize_t read( + char *Ptr, size_t Size, + const std::chrono::milliseconds &Timeout = std::chrono::milliseconds(-1)); }; } // end namespace llvm diff --git a/llvm/include/llvm/TargetParser/RISCVTargetParser.h b/llvm/include/llvm/TargetParser/RISCVTargetParser.h index 7421dac2744b6..c75778952e0f5 100644 --- a/llvm/include/llvm/TargetParser/RISCVTargetParser.h +++ b/llvm/include/llvm/TargetParser/RISCVTargetParser.h @@ -24,6 +24,14 @@ class Triple; namespace RISCV { +namespace RISCVExtensionBitmaskTable { +struct RISCVExtensionBitmask { + const char *Name; + unsigned GroupID; + unsigned BitPosition; +}; +} // namespace RISCVExtensionBitmaskTable + // We use 64 bits as the known part in the scalable vector types. 
static constexpr unsigned RVVBitsPerBlock = 64; diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 03eb6ef42b0ff..8f1c42b7ede44 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -934,7 +934,7 @@ getKnownBitsFromAndXorOr(const Operator *I, const APInt &DemandedElts, // Demanded) == (xor(x, x-1) & Demanded). Extend the xor pattern // to use arbitrary C if xor(x, x-C) as the same as xor(x, x-1). if (HasKnownOne && - match(I, m_c_Xor(m_Value(X), m_c_Add(m_Deferred(X), m_AllOnes())))) { + match(I, m_c_Xor(m_Value(X), m_Add(m_Deferred(X), m_AllOnes())))) { const KnownBits &XBits = I->getOperand(0) == X ? KnownLHS : KnownRHS; KnownOut = XBits.blsmsk(); } @@ -6771,15 +6771,16 @@ bool llvm::isSafeToSpeculativelyExecute(const Instruction *Inst, const Instruction *CtxI, AssumptionCache *AC, const DominatorTree *DT, - const TargetLibraryInfo *TLI) { + const TargetLibraryInfo *TLI, + bool UseVariableInfo) { return isSafeToSpeculativelyExecuteWithOpcode(Inst->getOpcode(), Inst, CtxI, - AC, DT, TLI); + AC, DT, TLI, UseVariableInfo); } bool llvm::isSafeToSpeculativelyExecuteWithOpcode( unsigned Opcode, const Instruction *Inst, const Instruction *CtxI, - AssumptionCache *AC, const DominatorTree *DT, - const TargetLibraryInfo *TLI) { + AssumptionCache *AC, const DominatorTree *DT, const TargetLibraryInfo *TLI, + bool UseVariableInfo) { #ifndef NDEBUG if (Inst->getOpcode() != Opcode) { // Check that the operands are actually compatible with the Opcode override. 
@@ -6831,6 +6832,9 @@ bool llvm::isSafeToSpeculativelyExecuteWithOpcode( return false; } case Instruction::Load: { + if (!UseVariableInfo) + return false; + const LoadInst *LI = dyn_cast(Inst); if (!LI) return false; diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp index c0fc7a2b35ea3..92a03eb52e35d 100644 --- a/llvm/lib/CodeGen/BranchFolding.cpp +++ b/llvm/lib/CodeGen/BranchFolding.cpp @@ -80,7 +80,6 @@ TailMergeThreshold("tail-merge-threshold", cl::init(150), cl::Hidden); // Heuristic for tail merging (and, inversely, tail duplication). -// TODO: This should be replaced with a target query. static cl::opt TailMergeSize("tail-merge-size", cl::desc("Min number of instructions to consider tail merging"), @@ -145,8 +144,6 @@ BranchFolder::BranchFolder(bool DefaultEnableTailMerge, bool CommonHoist, ProfileSummaryInfo *PSI, unsigned MinTailLength) : EnableHoistCommonCode(CommonHoist), MinCommonTailLength(MinTailLength), MBBFreqInfo(FreqInfo), MBPI(ProbInfo), PSI(PSI) { - if (MinCommonTailLength == 0) - MinCommonTailLength = TailMergeSize; switch (FlagEnableTailMerge) { case cl::BOU_UNSET: EnableTailMerge = DefaultEnableTailMerge; @@ -195,6 +192,12 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF, MLI = mli; this->MRI = &MRI; + if (MinCommonTailLength == 0) { + MinCommonTailLength = TailMergeSize.getNumOccurrences() > 0 + ? 
TailMergeSize + : TII->getTailMergeSize(MF); + } + UpdateLiveIns = MRI.tracksLiveness() && TRI->trackLivenessAfterRegAlloc(MF); if (!UpdateLiveIns) MRI.invalidateLiveness(); diff --git a/llvm/lib/Support/raw_socket_stream.cpp b/llvm/lib/Support/raw_socket_stream.cpp index 4cd3d58b80198..04b3233084a41 100644 --- a/llvm/lib/Support/raw_socket_stream.cpp +++ b/llvm/lib/Support/raw_socket_stream.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #ifndef _WIN32 @@ -177,70 +178,89 @@ Expected ListeningSocket::createUnix(StringRef SocketPath, #endif // _WIN32 } -Expected> -ListeningSocket::accept(std::chrono::milliseconds Timeout) { - - struct pollfd FDs[2]; - FDs[0].events = POLLIN; +// If a file descriptor being monitored by ::poll is closed by another thread, +// the result is unspecified. In the case ::poll does not unblock and return, +// when ActiveFD is closed, you can provide another file descriptor via CancelFD +// that when written to will cause poll to return. Typically CancelFD is the +// read end of a unidirectional pipe. 
+// +// Timeout should be -1 to block indefinitly +// +// getActiveFD is a callback to handle ActiveFD's of std::atomic and int +static std::error_code +manageTimeout(const std::chrono::milliseconds &Timeout, + const std::function &getActiveFD, + const std::optional &CancelFD = std::nullopt) { + struct pollfd FD[2]; + FD[0].events = POLLIN; #ifdef _WIN32 - SOCKET WinServerSock = _get_osfhandle(FD); - FDs[0].fd = WinServerSock; + SOCKET WinServerSock = _get_osfhandle(getActiveFD()); + FD[0].fd = WinServerSock; #else - FDs[0].fd = FD; + FD[0].fd = getActiveFD(); #endif - FDs[1].events = POLLIN; - FDs[1].fd = PipeFD[0]; - - // Keep track of how much time has passed in case poll is interupted by a - // signal and needs to be recalled - int RemainingTime = Timeout.count(); - std::chrono::milliseconds ElapsedTime = std::chrono::milliseconds(0); - int PollStatus = -1; - - while (PollStatus == -1 && (Timeout.count() == -1 || ElapsedTime < Timeout)) { - if (Timeout.count() != -1) - RemainingTime -= ElapsedTime.count(); + uint8_t FDCount = 1; + if (CancelFD.has_value()) { + FD[1].events = POLLIN; + FD[1].fd = CancelFD.value(); + FDCount++; + } - auto Start = std::chrono::steady_clock::now(); + // Keep track of how much time has passed in case ::poll or WSAPoll are + // interupted by a signal and need to be recalled + auto Start = std::chrono::steady_clock::now(); + auto RemainingTimeout = Timeout; + int PollStatus = 0; + do { + // If Timeout is -1 then poll should block and RemainingTimeout does not + // need to be recalculated + if (PollStatus != 0 && Timeout != std::chrono::milliseconds(-1)) { + auto TotalElapsedTime = + std::chrono::duration_cast( + std::chrono::steady_clock::now() - Start); + + if (TotalElapsedTime >= Timeout) + return std::make_error_code(std::errc::operation_would_block); + + RemainingTimeout = Timeout - TotalElapsedTime; + } #ifdef _WIN32 - PollStatus = WSAPoll(FDs, 2, RemainingTime); + PollStatus = WSAPoll(FD, FDCount, RemainingTimeout.count()); + } 
while (PollStatus == SOCKET_ERROR && + getLastSocketErrorCode() == std::errc::interrupted); #else - PollStatus = ::poll(FDs, 2, RemainingTime); + PollStatus = ::poll(FD, FDCount, RemainingTimeout.count()); + } while (PollStatus == -1 && + getLastSocketErrorCode() == std::errc::interrupted); #endif - // If FD equals -1 then ListeningSocket::shutdown has been called and it is - // appropriate to return operation_canceled - if (FD.load() == -1) - return llvm::make_error( - std::make_error_code(std::errc::operation_canceled), - "Accept canceled"); + // If ActiveFD equals -1 or CancelFD has data to be read then the operation + // has been canceled by another thread + if (getActiveFD() == -1 || (CancelFD.has_value() && FD[1].revents & POLLIN)) + return std::make_error_code(std::errc::operation_canceled); #if _WIN32 - if (PollStatus == SOCKET_ERROR) { + if (PollStatus == SOCKET_ERROR) #else - if (PollStatus == -1) { + if (PollStatus == -1) #endif - std::error_code PollErrCode = getLastSocketErrorCode(); - // Ignore EINTR (signal occured before any request event) and retry - if (PollErrCode != std::errc::interrupted) - return llvm::make_error(PollErrCode, "FD poll failed"); - } - if (PollStatus == 0) - return llvm::make_error( - std::make_error_code(std::errc::timed_out), - "No client requests within timeout window"); - - if (FDs[0].revents & POLLNVAL) - return llvm::make_error( - std::make_error_code(std::errc::bad_file_descriptor)); + return getLastSocketErrorCode(); + if (PollStatus == 0) + return std::make_error_code(std::errc::timed_out); + if (FD[0].revents & POLLNVAL) + return std::make_error_code(std::errc::bad_file_descriptor); + return std::error_code(); +} - auto Stop = std::chrono::steady_clock::now(); - ElapsedTime += - std::chrono::duration_cast(Stop - Start); - } +Expected> +ListeningSocket::accept(const std::chrono::milliseconds &Timeout) { + auto getActiveFD = [this]() -> int { return FD; }; + std::error_code TimeoutErr = manageTimeout(Timeout, 
getActiveFD, PipeFD[0]); + if (TimeoutErr) + return llvm::make_error(TimeoutErr, "Timeout error"); int AcceptFD; #ifdef _WIN32 - SOCKET WinAcceptSock = ::accept(WinServerSock, NULL, NULL); + SOCKET WinAcceptSock = ::accept(_get_osfhandle(FD), NULL, NULL); AcceptFD = _open_osfhandle(WinAcceptSock, 0); #else AcceptFD = ::accept(FD, NULL, NULL); @@ -295,6 +315,8 @@ ListeningSocket::~ListeningSocket() { raw_socket_stream::raw_socket_stream(int SocketFD) : raw_fd_stream(SocketFD, true) {} +raw_socket_stream::~raw_socket_stream() {} + Expected> raw_socket_stream::createConnectedUnix(StringRef SocketPath) { #ifdef _WIN32 @@ -306,4 +328,14 @@ raw_socket_stream::createConnectedUnix(StringRef SocketPath) { return std::make_unique(*FD); } -raw_socket_stream::~raw_socket_stream() {} +ssize_t raw_socket_stream::read(char *Ptr, size_t Size, + const std::chrono::milliseconds &Timeout) { + auto getActiveFD = [this]() -> int { return this->get_fd(); }; + std::error_code Err = manageTimeout(Timeout, getActiveFD); + // Mimic raw_fd_stream::read error handling behavior + if (Err) { + raw_fd_stream::error_detected(Err); + return -1; + } + return raw_fd_stream::read(Ptr, Size); +} diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 452dac4b00993..bd4203ccd6fe4 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -217,7 +217,7 @@ class SIMemOpInfo final { class SIMemOpAccess final { private: - AMDGPUMachineModuleInfo *MMI = nullptr; + const AMDGPUMachineModuleInfo *MMI = nullptr; /// Reports unsupported message \p Msg for \p MI to LLVM context. void reportUnsupported(const MachineBasicBlock::iterator &MI, @@ -241,7 +241,7 @@ class SIMemOpAccess final { public: /// Construct class to support accessing the machine memory operands /// of instructions in the machine function \p MF. 
- SIMemOpAccess(MachineFunction &MF); + SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI); /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise. std::optional @@ -806,9 +806,8 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const { return SIAtomicAddrSpace::OTHER; } -SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) { - MMI = &MF.getMMI().getObjFileInfo(); -} +SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_) + : MMI(&MMI_) {} std::optional SIMemOpAccess::constructFromMIWithMMO( const MachineBasicBlock::iterator &MI) const { @@ -2802,7 +2801,10 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; - SIMemOpAccess MOA(MF); + const MachineModuleInfo &MMI = + getAnalysis().getMMI(); + + SIMemOpAccess MOA(MMI.getObjFileInfo()); CC = SICacheControl::create(MF.getSubtarget()); for (auto &MBB : MF) { diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index b75cd1beadc58..4ef009c87a1e6 100644 --- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -829,7 +829,7 @@ bool PolynomialMultiplyRecognize::matchRightShift(SelectInst *SelI, return false; Value *X = nullptr; - if (!match(C, m_c_And(m_Value(X), m_One()))) + if (!match(C, m_And(m_Value(X), m_One()))) return false; // Matched: select (X & 1) == +++ ? ... : ... // select (X & 1) != +++ ? ... : ... 
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index ba6be85c7f2e8..496f126b7173d 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -335,6 +335,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::SRL); + setTargetDAGCombine(ISD::SETCC); // Set DAG combine for 'LSX' feature. @@ -2528,6 +2529,169 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static bool checkValueWidth(SDValue V, ISD::LoadExtType &ExtType) { + ExtType = ISD::NON_EXTLOAD; + + switch (V.getNode()->getOpcode()) { + case ISD::LOAD: { + LoadSDNode *LoadNode = cast(V.getNode()); + if ((LoadNode->getMemoryVT() == MVT::i8) || + (LoadNode->getMemoryVT() == MVT::i16)) { + ExtType = LoadNode->getExtensionType(); + return true; + } + return false; + } + case ISD::AssertSext: { + VTSDNode *TypeNode = cast(V.getNode()->getOperand(1)); + if ((TypeNode->getVT() == MVT::i8) || (TypeNode->getVT() == MVT::i16)) { + ExtType = ISD::SEXTLOAD; + return true; + } + return false; + } + case ISD::AssertZext: { + VTSDNode *TypeNode = cast(V.getNode()->getOperand(1)); + if ((TypeNode->getVT() == MVT::i8) || (TypeNode->getVT() == MVT::i16)) { + ExtType = ISD::ZEXTLOAD; + return true; + } + return false; + } + default: + return false; + } + + return false; +} + +// Eliminate redundant truncation and zero-extension nodes. 
+// * Case 1: +// +------------+ +------------+ +------------+ +// | Input1 | | Input2 | | CC | +// +------------+ +------------+ +------------+ +// | | | +// V V +----+ +// +------------+ +------------+ | +// | TRUNCATE | | TRUNCATE | | +// +------------+ +------------+ | +// | | | +// V V | +// +------------+ +------------+ | +// | ZERO_EXT | | ZERO_EXT | | +// +------------+ +------------+ | +// | | | +// | +-------------+ | +// V V | | +// +----------------+ | | +// | AND | | | +// +----------------+ | | +// | | | +// +---------------+ | | +// | | | +// V V V +// +-------------+ +// | CMP | +// +-------------+ +// * Case 2: +// +------------+ +------------+ +-------------+ +------------+ +------------+ +// | Input1 | | Input2 | | Constant -1 | | Constant 0 | | CC | +// +------------+ +------------+ +-------------+ +------------+ +------------+ +// | | | | | +// V | | | | +// +------------+ | | | | +// | XOR |<---------------------+ | | +// +------------+ | | | +// | | | | +// V V +---------------+ | +// +------------+ +------------+ | | +// | TRUNCATE | | TRUNCATE | | +-------------------------+ +// +------------+ +------------+ | | +// | | | | +// V V | | +// +------------+ +------------+ | | +// | ZERO_EXT | | ZERO_EXT | | | +// +------------+ +------------+ | | +// | | | | +// V V | | +// +----------------+ | | +// | AND | | | +// +----------------+ | | +// | | | +// +---------------+ | | +// | | | +// V V V +// +-------------+ +// | CMP | +// +-------------+ +static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const LoongArchSubtarget &Subtarget) { + ISD::CondCode CC = cast(N->getOperand(2))->get(); + + SDNode *AndNode = N->getOperand(0).getNode(); + if (AndNode->getOpcode() != ISD::AND) + return SDValue(); + + SDValue AndInputValue2 = AndNode->getOperand(1); + if (AndInputValue2.getOpcode() != ISD::ZERO_EXTEND) + return SDValue(); + + SDValue CmpInputValue = N->getOperand(1); + SDValue AndInputValue1 = 
AndNode->getOperand(0); + if (AndInputValue1.getOpcode() == ISD::XOR) { + if (CC != ISD::SETEQ && CC != ISD::SETNE) + return SDValue(); + ConstantSDNode *CN = dyn_cast(AndInputValue1.getOperand(1)); + if (!CN || CN->getSExtValue() != -1) + return SDValue(); + CN = dyn_cast(CmpInputValue); + if (!CN || CN->getSExtValue() != 0) + return SDValue(); + AndInputValue1 = AndInputValue1.getOperand(0); + if (AndInputValue1.getOpcode() != ISD::ZERO_EXTEND) + return SDValue(); + } else if (AndInputValue1.getOpcode() == ISD::ZERO_EXTEND) { + if (AndInputValue2 != CmpInputValue) + return SDValue(); + } else { + return SDValue(); + } + + SDValue TruncValue1 = AndInputValue1.getNode()->getOperand(0); + if (TruncValue1.getOpcode() != ISD::TRUNCATE) + return SDValue(); + + SDValue TruncValue2 = AndInputValue2.getNode()->getOperand(0); + if (TruncValue2.getOpcode() != ISD::TRUNCATE) + return SDValue(); + + SDValue TruncInputValue1 = TruncValue1.getNode()->getOperand(0); + SDValue TruncInputValue2 = TruncValue2.getNode()->getOperand(0); + ISD::LoadExtType ExtType1; + ISD::LoadExtType ExtType2; + + if (!checkValueWidth(TruncInputValue1, ExtType1) || + !checkValueWidth(TruncInputValue2, ExtType2)) + return SDValue(); + + if (TruncInputValue1->getValueType(0) != TruncInputValue2->getValueType(0) || + AndNode->getValueType(0) != TruncInputValue1->getValueType(0)) + return SDValue(); + + if ((ExtType2 != ISD::ZEXTLOAD) && + ((ExtType2 != ISD::SEXTLOAD) && (ExtType1 != ISD::SEXTLOAD))) + return SDValue(); + + // These truncation and zero-extension nodes are not necessary, remove them. + SDValue NewAnd = DAG.getNode(ISD::AND, SDLoc(N), AndNode->getValueType(0), + TruncInputValue1, TruncInputValue2); + SDValue NewSetCC = + DAG.getSetCC(SDLoc(N), N->getValueType(0), NewAnd, TruncInputValue2, CC); + DAG.ReplaceAllUsesWith(N, NewSetCC.getNode()); + return SDValue(N, 0); +} + // Combine (loongarch_bitrev_w (loongarch_revb_2w X)) to loongarch_bitrev_4b. 
static SDValue performBITREV_WCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -3155,6 +3319,8 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N, return performANDCombine(N, DAG, DCI, Subtarget); case ISD::OR: return performORCombine(N, DAG, DCI, Subtarget); + case ISD::SETCC: + return performSETCCCombine(N, DAG, DCI, Subtarget); case ISD::SRL: return performSRLCombine(N, DAG, DCI, Subtarget); case LoongArchISD::BITREV_W: diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index d74143b487880..0e2811f87c817 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -909,6 +909,23 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { // Lower multi-instruction pseudo operations. switch (MI->getOpcode()) { default: break; + case TargetOpcode::PATCHABLE_FUNCTION_ENTER: { + assert(!Subtarget->isAIXABI() && + "AIX does not support patchable function entry!"); + // PATCHABLE_FUNCTION_ENTER on little endian is for XRAY support which is + // handled in PPCLinuxAsmPrinter. + if (MAI->isLittleEndian()) + return; + const Function &F = MF->getFunction(); + unsigned Num = 0; + (void)F.getFnAttribute("patchable-function-entry") + .getValueAsString() + .getAsInteger(10, Num); + if (!Num) + return; + emitNops(Num); + return; + } case TargetOpcode::DBG_VALUE: llvm_unreachable("Should be handled target independently"); case TargetOpcode::STACKMAP: @@ -1780,7 +1797,7 @@ void PPCLinuxAsmPrinter::emitInstruction(const MachineInstr *MI) { switch (MI->getOpcode()) { default: - return PPCAsmPrinter::emitInstruction(MI); + break; case TargetOpcode::PATCHABLE_FUNCTION_ENTER: { // .begin: // b .end # lis 0, FuncId[16..32] @@ -1793,6 +1810,9 @@ void PPCLinuxAsmPrinter::emitInstruction(const MachineInstr *MI) { // // Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when number // of instructions change. 
+ // XRAY is only supported on PPC Linux little endian. + if (!MAI->isLittleEndian()) + break; MCSymbol *BeginOfSled = OutContext.createTempSymbol(); MCSymbol *EndOfSled = OutContext.createTempSymbol(); OutStreamer->emitLabel(BeginOfSled); @@ -1834,7 +1854,7 @@ void PPCLinuxAsmPrinter::emitInstruction(const MachineInstr *MI) { IsConditional = false; } else { EmitToStreamer(*OutStreamer, RetInst); - break; + return; } MCSymbol *FallthroughLabel; @@ -1899,7 +1919,7 @@ void PPCLinuxAsmPrinter::emitInstruction(const MachineInstr *MI) { if (IsConditional) OutStreamer->emitLabel(FallthroughLabel); recordSled(BeginOfSled, *MI, SledKind::FUNCTION_EXIT, 2); - break; + return; } case TargetOpcode::PATCHABLE_FUNCTION_EXIT: llvm_unreachable("PATCHABLE_FUNCTION_EXIT should never be emitted"); @@ -1909,6 +1929,7 @@ void PPCLinuxAsmPrinter::emitInstruction(const MachineInstr *MI) { llvm_unreachable("Tail call is handled in the normal case. See comments " "around this assert."); } + return PPCAsmPrinter::emitInstruction(MI); } void PPCLinuxAsmPrinter::emitStartOfAsmFile(Module &M) { diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index c9979b2b36fc3..c96d2dd83816b 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -37,6 +37,15 @@ class RISCVExtension groupID, int bitPos> { + int GroupID = groupID; + int BitPos = bitPos; +} + // Version of RISCVExtension to be used for Experimental extensions. This // sets the Experimental flag and prepends experimental- to the -mattr name. 
class RISCVExperimentalExtension; + "'I' (Base Integer Instruction Set)">, + RISCVExtensionBitmask<0, 8>; def FeatureStdExtE : RISCVExtension<"e", 2, 0, @@ -78,7 +88,8 @@ def HasStdExtZicbop : Predicate<"Subtarget->hasStdExtZicbop()">, def FeatureStdExtZicboz : RISCVExtension<"zicboz", 1, 0, - "'Zicboz' (Cache-Block Zero Instructions)">; + "'Zicboz' (Cache-Block Zero Instructions)">, + RISCVExtensionBitmask<0, 37>; def HasStdExtZicboz : Predicate<"Subtarget->hasStdExtZicboz()">, AssemblerPredicate<(all_of FeatureStdExtZicboz), "'Zicboz' (Cache-Block Zero Instructions)">; @@ -113,7 +124,8 @@ def FeatureStdExtZicntr def FeatureStdExtZicond : RISCVExtension<"zicond", 1, 0, - "'Zicond' (Integer Conditional Operations)">; + "'Zicond' (Integer Conditional Operations)">, + RISCVExtensionBitmask<0, 38>; def HasStdExtZicond : Predicate<"Subtarget->hasStdExtZicond()">, AssemblerPredicate<(all_of FeatureStdExtZicond), "'Zicond' (Integer Conditional Operations)">; @@ -127,14 +139,16 @@ def HasStdExtZifencei : Predicate<"Subtarget->hasStdExtZifencei()">, def FeatureStdExtZihintpause : RISCVExtension<"zihintpause", 2, 0, - "'Zihintpause' (Pause Hint)">; + "'Zihintpause' (Pause Hint)">, + RISCVExtensionBitmask<0, 40>; def HasStdExtZihintpause : Predicate<"Subtarget->hasStdExtZihintpause()">, AssemblerPredicate<(all_of FeatureStdExtZihintpause), "'Zihintpause' (Pause Hint)">; def FeatureStdExtZihintntl : RISCVExtension<"zihintntl", 1, 0, - "'Zihintntl' (Non-Temporal Locality Hints)">; + "'Zihintntl' (Non-Temporal Locality Hints)">, + RISCVExtensionBitmask<0, 39>; def HasStdExtZihintntl : Predicate<"Subtarget->hasStdExtZihintntl()">, AssemblerPredicate<(all_of FeatureStdExtZihintntl), "'Zihintntl' (Non-Temporal Locality Hints)">; @@ -181,7 +195,8 @@ def HasStdExtZmmul : Predicate<"Subtarget->hasStdExtZmmul()">, def FeatureStdExtM : RISCVExtension<"m", 2, 0, "'M' (Integer Multiplication and Division)", - [FeatureStdExtZmmul]>; + [FeatureStdExtZmmul]>, + RISCVExtensionBitmask<0, 12>; 
def HasStdExtM : Predicate<"Subtarget->hasStdExtM()">, AssemblerPredicate<(all_of FeatureStdExtM), "'M' (Integer Multiplication and Division)">; @@ -190,14 +205,16 @@ def HasStdExtM : Predicate<"Subtarget->hasStdExtM()">, def FeatureStdExtA : RISCVExtension<"a", 2, 1, - "'A' (Atomic Instructions)">; + "'A' (Atomic Instructions)">, + RISCVExtensionBitmask<0, 0>; def HasStdExtA : Predicate<"Subtarget->hasStdExtA()">, AssemblerPredicate<(all_of FeatureStdExtA), "'A' (Atomic Instructions)">; def FeatureStdExtZtso : RISCVExtension<"ztso", 1, 0, - "'Ztso' (Memory Model - Total Store Order)">; + "'Ztso' (Memory Model - Total Store Order)">, + RISCVExtensionBitmask<0, 47>; def HasStdExtZtso : Predicate<"Subtarget->hasStdExtZtso()">, AssemblerPredicate<(all_of FeatureStdExtZtso), "'Ztso' (Memory Model - Total Store Order)">; @@ -227,7 +244,8 @@ def HasStdExtZabha : Predicate<"Subtarget->hasStdExtZabha()">, def FeatureStdExtZacas : RISCVExtension<"zacas", 1, 0, - "'Zacas' (Atomic Compare-And-Swap Instructions)">; + "'Zacas' (Atomic Compare-And-Swap Instructions)">, + RISCVExtensionBitmask<0, 26>; def HasStdExtZacas : Predicate<"Subtarget->hasStdExtZacas()">, AssemblerPredicate<(all_of FeatureStdExtZacas), "'Zacas' (Atomic Compare-And-Swap Instructions)">; @@ -264,7 +282,8 @@ def HasStdExtZawrs : Predicate<"Subtarget->hasStdExtZawrs()">, def FeatureStdExtF : RISCVExtension<"f", 2, 2, "'F' (Single-Precision Floating-Point)", - [FeatureStdExtZicsr]>; + [FeatureStdExtZicsr]>, + RISCVExtensionBitmask<0, 5>; def HasStdExtF : Predicate<"Subtarget->hasStdExtF()">, AssemblerPredicate<(all_of FeatureStdExtF), "'F' (Single-Precision Floating-Point)">; @@ -272,7 +291,8 @@ def HasStdExtF : Predicate<"Subtarget->hasStdExtF()">, def FeatureStdExtD : RISCVExtension<"d", 2, 2, "'D' (Double-Precision Floating-Point)", - [FeatureStdExtF]>; + [FeatureStdExtF]>, + RISCVExtensionBitmask<0, 3>; def HasStdExtD : Predicate<"Subtarget->hasStdExtD()">, AssemblerPredicate<(all_of FeatureStdExtD), "'D' 
(Double-Precision Floating-Point)">; @@ -280,7 +300,8 @@ def HasStdExtD : Predicate<"Subtarget->hasStdExtD()">, def FeatureStdExtZfhmin : RISCVExtension<"zfhmin", 1, 0, "'Zfhmin' (Half-Precision Floating-Point Minimal)", - [FeatureStdExtF]>; + [FeatureStdExtF]>, + RISCVExtensionBitmask<0, 36>; def HasStdExtZfhmin : Predicate<"Subtarget->hasStdExtZfhmin()">, AssemblerPredicate<(all_of FeatureStdExtZfhmin), "'Zfh' (Half-Precision Floating-Point) or " @@ -289,7 +310,8 @@ def HasStdExtZfhmin : Predicate<"Subtarget->hasStdExtZfhmin()">, def FeatureStdExtZfh : RISCVExtension<"zfh", 1, 0, "'Zfh' (Half-Precision Floating-Point)", - [FeatureStdExtZfhmin]>; + [FeatureStdExtZfhmin]>, + RISCVExtensionBitmask<0, 35>; def HasStdExtZfh : Predicate<"Subtarget->hasStdExtZfh()">, AssemblerPredicate<(all_of FeatureStdExtZfh), "'Zfh' (Half-Precision Floating-Point)">; @@ -313,7 +335,8 @@ def HasHalfFPLoadStoreMove def FeatureStdExtZfa : RISCVExtension<"zfa", 1, 0, "'Zfa' (Additional Floating-Point)", - [FeatureStdExtF]>; + [FeatureStdExtF]>, + RISCVExtensionBitmask<0, 34>; def HasStdExtZfa : Predicate<"Subtarget->hasStdExtZfa()">, AssemblerPredicate<(all_of FeatureStdExtZfa), "'Zfa' (Additional Floating-Point)">; @@ -356,7 +379,8 @@ def NoStdExtZhinx : Predicate<"!Subtarget->hasStdExtZhinx()">; def FeatureStdExtC : RISCVExtension<"c", 2, 0, - "'C' (Compressed Instructions)">; + "'C' (Compressed Instructions)">, + RISCVExtensionBitmask<0, 2>; def HasStdExtC : Predicate<"Subtarget->hasStdExtC()">, AssemblerPredicate<(all_of FeatureStdExtC), "'C' (Compressed Instructions)">; @@ -445,7 +469,8 @@ def HasStdExtZcmop : Predicate<"Subtarget->hasStdExtZcmop()">, def FeatureStdExtZba : RISCVExtension<"zba", 1, 0, - "'Zba' (Address Generation Instructions)">; + "'Zba' (Address Generation Instructions)">, + RISCVExtensionBitmask<0, 27>; def HasStdExtZba : Predicate<"Subtarget->hasStdExtZba()">, AssemblerPredicate<(all_of FeatureStdExtZba), "'Zba' (Address Generation Instructions)">; @@ -453,7 
+478,8 @@ def NotHasStdExtZba : Predicate<"!Subtarget->hasStdExtZba()">; def FeatureStdExtZbb : RISCVExtension<"zbb", 1, 0, - "'Zbb' (Basic Bit-Manipulation)">; + "'Zbb' (Basic Bit-Manipulation)">, + RISCVExtensionBitmask<0, 28>; def HasStdExtZbb : Predicate<"Subtarget->hasStdExtZbb()">, AssemblerPredicate<(all_of FeatureStdExtZbb), "'Zbb' (Basic Bit-Manipulation)">; @@ -462,14 +488,16 @@ def NoStdExtZbb : Predicate<"!Subtarget->hasStdExtZbb()">, def FeatureStdExtZbc : RISCVExtension<"zbc", 1, 0, - "'Zbc' (Carry-Less Multiplication)">; + "'Zbc' (Carry-Less Multiplication)">, + RISCVExtensionBitmask<0, 29>; def HasStdExtZbc : Predicate<"Subtarget->hasStdExtZbc()">, AssemblerPredicate<(all_of FeatureStdExtZbc), "'Zbc' (Carry-Less Multiplication)">; def FeatureStdExtZbs : RISCVExtension<"zbs", 1, 0, - "'Zbs' (Single-Bit Instructions)">; + "'Zbs' (Single-Bit Instructions)">, + RISCVExtensionBitmask<0, 33>; def HasStdExtZbs : Predicate<"Subtarget->hasStdExtZbs()">, AssemblerPredicate<(all_of FeatureStdExtZbs), "'Zbs' (Single-Bit Instructions)">; @@ -486,14 +514,16 @@ def HasStdExtB : Predicate<"Subtarget->hasStdExtB()">, def FeatureStdExtZbkb : RISCVExtension<"zbkb", 1, 0, - "'Zbkb' (Bitmanip instructions for Cryptography)">; + "'Zbkb' (Bitmanip instructions for Cryptography)">, + RISCVExtensionBitmask<0, 30>; def HasStdExtZbkb : Predicate<"Subtarget->hasStdExtZbkb()">, AssemblerPredicate<(all_of FeatureStdExtZbkb), "'Zbkb' (Bitmanip instructions for Cryptography)">; def FeatureStdExtZbkx : RISCVExtension<"zbkx", 1, 0, - "'Zbkx' (Crossbar permutation instructions)">; + "'Zbkx' (Crossbar permutation instructions)">, + RISCVExtensionBitmask<0, 32>; def HasStdExtZbkx : Predicate<"Subtarget->hasStdExtZbkx()">, AssemblerPredicate<(all_of FeatureStdExtZbkx), "'Zbkx' (Crossbar permutation instructions)">; @@ -510,7 +540,8 @@ def HasStdExtZbbOrZbkb def FeatureStdExtZbkc : RISCVExtension<"zbkc", 1, 0, "'Zbkc' (Carry-less multiply instructions for " - "Cryptography)">; + 
"Cryptography)">, + RISCVExtensionBitmask<0, 31>; def HasStdExtZbkc : Predicate<"Subtarget->hasStdExtZbkc()">, AssemblerPredicate<(all_of FeatureStdExtZbkc), @@ -527,14 +558,16 @@ def HasStdExtZbcOrZbkc def FeatureStdExtZknd : RISCVExtension<"zknd", 1, 0, - "'Zknd' (NIST Suite: AES Decryption)">; + "'Zknd' (NIST Suite: AES Decryption)">, + RISCVExtensionBitmask<0, 41>; def HasStdExtZknd : Predicate<"Subtarget->hasStdExtZknd()">, AssemblerPredicate<(all_of FeatureStdExtZknd), "'Zknd' (NIST Suite: AES Decryption)">; def FeatureStdExtZkne : RISCVExtension<"zkne", 1, 0, - "'Zkne' (NIST Suite: AES Encryption)">; + "'Zkne' (NIST Suite: AES Encryption)">, + RISCVExtensionBitmask<0, 42>; def HasStdExtZkne : Predicate<"Subtarget->hasStdExtZkne()">, AssemblerPredicate<(all_of FeatureStdExtZkne), "'Zkne' (NIST Suite: AES Encryption)">; @@ -549,21 +582,24 @@ def HasStdExtZkndOrZkne def FeatureStdExtZknh : RISCVExtension<"zknh", 1, 0, - "'Zknh' (NIST Suite: Hash Function Instructions)">; + "'Zknh' (NIST Suite: Hash Function Instructions)">, + RISCVExtensionBitmask<0, 43>; def HasStdExtZknh : Predicate<"Subtarget->hasStdExtZknh()">, AssemblerPredicate<(all_of FeatureStdExtZknh), "'Zknh' (NIST Suite: Hash Function Instructions)">; def FeatureStdExtZksed : RISCVExtension<"zksed", 1, 0, - "'Zksed' (ShangMi Suite: SM4 Block Cipher Instructions)">; + "'Zksed' (ShangMi Suite: SM4 Block Cipher Instructions)">, + RISCVExtensionBitmask<0, 44>; def HasStdExtZksed : Predicate<"Subtarget->hasStdExtZksed()">, AssemblerPredicate<(all_of FeatureStdExtZksed), "'Zksed' (ShangMi Suite: SM4 Block Cipher Instructions)">; def FeatureStdExtZksh : RISCVExtension<"zksh", 1, 0, - "'Zksh' (ShangMi Suite: SM3 Hash Function Instructions)">; + "'Zksh' (ShangMi Suite: SM3 Hash Function Instructions)">, + RISCVExtensionBitmask<0, 45>; def HasStdExtZksh : Predicate<"Subtarget->hasStdExtZksh()">, AssemblerPredicate<(all_of FeatureStdExtZksh), "'Zksh' (ShangMi Suite: SM3 Hash Function Instructions)">; @@ -596,7 
+632,8 @@ def FeatureStdExtZks def FeatureStdExtZkt : RISCVExtension<"zkt", 1, 0, - "'Zkt' (Data Independent Execution Latency)">; + "'Zkt' (Data Independent Execution Latency)">, + RISCVExtensionBitmask<0, 46>; def FeatureStdExtZk : RISCVExtension<"zk", 1, 0, @@ -626,6 +663,7 @@ def FeatureStdExtZve32x "with maximal 32 EEW)", [FeatureStdExtZicsr, FeatureStdExtZvl32b]>; + def FeatureStdExtZve32f : RISCVExtension<"zve32f", 1, 0, "'Zve32f' (Vector Extensions for Embedded Processors " @@ -653,7 +691,8 @@ def FeatureStdExtZve64d def FeatureStdExtV : RISCVExtension<"v", 1, 0, "'V' (Vector Extension for Application Processors)", - [FeatureStdExtZvl128b, FeatureStdExtZve64d]>; + [FeatureStdExtZvl128b, FeatureStdExtZve64d]>, + RISCVExtensionBitmask<0, 21>; def FeatureStdExtZvfbfmin : RISCVExtension<"zvfbfmin", 1, 0, "'Zvbfmin' (Vector BF16 Converts)", @@ -673,12 +712,14 @@ def HasStdExtZvfbfwma : Predicate<"Subtarget->hasStdExtZvfbfwma()">, def FeatureStdExtZvfhmin : RISCVExtension<"zvfhmin", 1, 0, "'Zvfhmin' (Vector Half-Precision Floating-Point Minimal)", - [FeatureStdExtZve32f]>; + [FeatureStdExtZve32f]>, + RISCVExtensionBitmask<0, 51>; def FeatureStdExtZvfh : RISCVExtension<"zvfh", 1, 0, "'Zvfh' (Vector Half-Precision Floating-Point)", - [FeatureStdExtZvfhmin, FeatureStdExtZfhmin]>; + [FeatureStdExtZvfhmin, FeatureStdExtZfhmin]>, + RISCVExtensionBitmask<0, 50>; def HasStdExtZfhOrZvfh : Predicate<"Subtarget->hasStdExtZfh() || Subtarget->hasStdExtZvfh()">, @@ -690,7 +731,8 @@ def HasStdExtZfhOrZvfh def FeatureStdExtZvkb : RISCVExtension<"zvkb", 1, 0, - "'Zvkb' (Vector Bit-manipulation used in Cryptography)">; + "'Zvkb' (Vector Bit-manipulation used in Cryptography)">, + RISCVExtensionBitmask<0, 52>; def HasStdExtZvkb : Predicate<"Subtarget->hasStdExtZvkb()">, AssemblerPredicate<(all_of FeatureStdExtZvkb), "'Zvkb' (Vector Bit-manipulation used in Cryptography)">; @@ -698,35 +740,40 @@ def HasStdExtZvkb : Predicate<"Subtarget->hasStdExtZvkb()">, def FeatureStdExtZvbb : 
RISCVExtension<"zvbb", 1, 0, "'Zvbb' (Vector basic bit-manipulation instructions)", - [FeatureStdExtZvkb]>; + [FeatureStdExtZvkb]>, + RISCVExtensionBitmask<0, 48>; def HasStdExtZvbb : Predicate<"Subtarget->hasStdExtZvbb()">, AssemblerPredicate<(all_of FeatureStdExtZvbb), "'Zvbb' (Vector basic bit-manipulation instructions)">; def FeatureStdExtZvbc : RISCVExtension<"zvbc", 1, 0, - "'Zvbc' (Vector Carryless Multiplication)">; + "'Zvbc' (Vector Carryless Multiplication)">, + RISCVExtensionBitmask<0, 49>; def HasStdExtZvbc : Predicate<"Subtarget->hasStdExtZvbc()">, AssemblerPredicate<(all_of FeatureStdExtZvbc), "'Zvbc' (Vector Carryless Multiplication)">; def FeatureStdExtZvkg : RISCVExtension<"zvkg", 1, 0, - "'Zvkg' (Vector GCM instructions for Cryptography)">; + "'Zvkg' (Vector GCM instructions for Cryptography)">, + RISCVExtensionBitmask<0, 53>; def HasStdExtZvkg : Predicate<"Subtarget->hasStdExtZvkg()">, AssemblerPredicate<(all_of FeatureStdExtZvkg), "'Zvkg' (Vector GCM instructions for Cryptography)">; def FeatureStdExtZvkned : RISCVExtension<"zvkned", 1, 0, - "'Zvkned' (Vector AES Encryption & Decryption (Single Round))">; + "'Zvkned' (Vector AES Encryption & Decryption (Single Round))">, + RISCVExtensionBitmask<0, 54>; def HasStdExtZvkned : Predicate<"Subtarget->hasStdExtZvkned()">, AssemblerPredicate<(all_of FeatureStdExtZvkned), "'Zvkned' (Vector AES Encryption & Decryption (Single Round))">; def FeatureStdExtZvknha : RISCVExtension<"zvknha", 1, 0, - "'Zvknha' (Vector SHA-2 (SHA-256 only))">; + "'Zvknha' (Vector SHA-2 (SHA-256 only))">, + RISCVExtensionBitmask<0, 55>; def HasStdExtZvknha : Predicate<"Subtarget->hasStdExtZvknha()">, AssemblerPredicate<(all_of FeatureStdExtZvknha), "'Zvknha' (Vector SHA-2 (SHA-256 only))">; @@ -734,7 +781,8 @@ def HasStdExtZvknha : Predicate<"Subtarget->hasStdExtZvknha()">, def FeatureStdExtZvknhb : RISCVExtension<"zvknhb", 1, 0, "'Zvknhb' (Vector SHA-2 (SHA-256 and SHA-512))", - [FeatureStdExtZve64x]>; + [FeatureStdExtZve64x]>, 
+ RISCVExtensionBitmask<0, 56>; def HasStdExtZvknhb : Predicate<"Subtarget->hasStdExtZvknhb()">, AssemblerPredicate<(all_of FeatureStdExtZvknhb), "'Zvknhb' (Vector SHA-2 (SHA-256 and SHA-512))">; @@ -745,21 +793,24 @@ def HasStdExtZvknhaOrZvknhb : Predicate<"Subtarget->hasStdExtZvknha() || Subtarg def FeatureStdExtZvksed : RISCVExtension<"zvksed", 1, 0, - "'Zvksed' (SM4 Block Cipher Instructions)">; + "'Zvksed' (SM4 Block Cipher Instructions)">, + RISCVExtensionBitmask<0, 57>; def HasStdExtZvksed : Predicate<"Subtarget->hasStdExtZvksed()">, AssemblerPredicate<(all_of FeatureStdExtZvksed), "'Zvksed' (SM4 Block Cipher Instructions)">; def FeatureStdExtZvksh : RISCVExtension<"zvksh", 1, 0, - "'Zvksh' (SM3 Hash Function Instructions)">; + "'Zvksh' (SM3 Hash Function Instructions)">, + RISCVExtensionBitmask<0, 58>; def HasStdExtZvksh : Predicate<"Subtarget->hasStdExtZvksh()">, AssemblerPredicate<(all_of FeatureStdExtZvksh), "'Zvksh' (SM3 Hash Function Instructions)">; def FeatureStdExtZvkt : RISCVExtension<"zvkt", 1, 0, - "'Zvkt' (Vector Data-Independent Execution Latency)">; + "'Zvkt' (Vector Data-Independent Execution Latency)">, + RISCVExtensionBitmask<0, 59>; // Zvk short-hand extensions @@ -796,7 +847,6 @@ def FeatureStdExtZvksg : RISCVExtension<"zvksg", 1, 0, "'Zvksg' (shorthand for 'Zvks' and 'Zvkg')", [FeatureStdExtZvks, FeatureStdExtZvkg]>; - // Vector instruction predicates def HasVInstructions : Predicate<"Subtarget->hasVInstructions()">, diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index f9eef60f77b7a..5a92d6bab31a9 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1688,7 +1688,6 @@ InstructionCost RISCVTTIImpl::getArithmeticInstrCost( return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info, Args, CxtI); - auto getConstantMatCost = [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> 
InstructionCost { if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand)) @@ -1760,8 +1759,14 @@ InstructionCost RISCVTTIImpl::getArithmeticInstrCost( Op1Info, Op2Info, Args, CxtI); } - return ConstantMatCost + - LT.first * getRISCVInstructionCost(Op, LT.second, CostKind); + + InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind); + // We use BasicTTIImpl to calculate scalar costs, which assumes floating point + // ops are twice as expensive as integer ops. Do the same for vectors so + // scalar floating point ops aren't cheaper than their vector equivalents. + if (Ty->isFPOrFPVectorTy()) + InstrCost *= 2; + return ConstantMatCost + LT.first * InstrCost; } // TODO: Deduplicate from TargetTransformInfoImplCRTPBase. diff --git a/llvm/lib/TargetParser/RISCVTargetParser.cpp b/llvm/lib/TargetParser/RISCVTargetParser.cpp index db1b5f689d7da..49a35bfcf4b9b 100644 --- a/llvm/lib/TargetParser/RISCVTargetParser.cpp +++ b/llvm/lib/TargetParser/RISCVTargetParser.cpp @@ -128,6 +128,22 @@ void getFeaturesForCPU(StringRef CPU, else EnabledFeatures.push_back(F.substr(1)); } + +namespace RISCVExtensionBitmaskTable { +#define GET_RISCVExtensionBitmaskTable_IMPL +#include "llvm/TargetParser/RISCVTargetParserDef.inc" + +} // namespace RISCVExtensionBitmaskTable + +namespace { +struct LessExtName { + bool operator()(const RISCVExtensionBitmaskTable::RISCVExtensionBitmask &LHS, + StringRef RHS) { + return StringRef(LHS.Name) < RHS; + } +}; +} // namespace + } // namespace RISCV namespace RISCVVType { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index e387034110df9..aaf4ece3249a2 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1247,8 +1247,11 @@ bool InstCombinerImpl::replaceInInstruction(Value *V, Value *Old, Value *New, if (Depth == 2) return false; + assert(!isa(Old) && "Only replace 
non-constant values"); + auto *I = dyn_cast(V); - if (!I || !I->hasOneUse() || !isSafeToSpeculativelyExecute(I)) + if (!I || !I->hasOneUse() || + !isSafeToSpeculativelyExecuteWithVariableReplaced(I)) return false; bool Changed = false; @@ -1331,7 +1334,7 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel, // profitability is not clear for other cases. // FIXME: Support vectors. if (OldOp == CmpLHS && match(NewOp, m_ImmConstant()) && - !match(OldOp, m_ImmConstant()) && !Cmp.getType()->isVectorTy() && + !match(OldOp, m_Constant()) && !Cmp.getType()->isVectorTy() && isGuaranteedNotToBeUndef(NewOp, SQ.AC, &Sel, &DT)) if (replaceInInstruction(TrueVal, OldOp, NewOp)) return &Sel; diff --git a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp index 9be60721bebb5..75585fcc80266 100644 --- a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -425,14 +425,12 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForAdd( // Returns true if A matches B + C where C is constant. static bool matchesAdd(Value *A, Value *&B, ConstantInt *&C) { - return (match(A, m_Add(m_Value(B), m_ConstantInt(C))) || - match(A, m_Add(m_ConstantInt(C), m_Value(B)))); + return match(A, m_c_Add(m_Value(B), m_ConstantInt(C))); } // Returns true if A matches B | C where C is constant. 
static bool matchesOr(Value *A, Value *&B, ConstantInt *&C) { - return (match(A, m_Or(m_Value(B), m_ConstantInt(C))) || - match(A, m_Or(m_ConstantInt(C), m_Value(B)))); + return match(A, m_c_Or(m_Value(B), m_ConstantInt(C))); } void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul( diff --git a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll index d1e8bb015491e..5236f5a3bae95 100644 --- a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll +++ b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll @@ -8,36 +8,36 @@ define i32 @fadd() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fadd half undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fadd float undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fadd double undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = fadd <1 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = fadd <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fadd <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fadd <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fadd <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32F16 = fadd <32 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV8F16 = fadd undef, undef -; CHECK-NEXT: 
Cost Model: Found an estimated cost of 4 for instruction: %NXV16F16 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV32F16 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F32 = fadd <1 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fadd <2 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fadd <4 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fadd <8 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fadd <16 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F32 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F32 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F32 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = fadd <1 x double> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fadd <2 x double> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fadd <4 x double> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = fadd <8 x double> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F64 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: 
%NXV4F64 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F64 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fadd <1 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fadd <2 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fadd <4 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fadd <8 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fadd <16 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fadd <32 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fadd <1 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fadd <2 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fadd <4 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fadd <8 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fadd <16 x float> undef, undef +; CHECK-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = fadd <1 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fadd <2 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fadd <4 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fadd <8 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fadd undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %F16 = fadd half undef, undef @@ -88,36 +88,36 @@ define i32 @fsub() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fsub half undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fsub float undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fsub double undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = fsub <1 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V2F16 = fsub <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fsub <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fsub <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fsub <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32F16 = fsub <32 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = fsub undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = fsub undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = fsub undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV8F16 = fsub undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV16F16 = fsub undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV32F16 = fsub undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F32 = fsub <1 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fsub <2 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fsub <4 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fsub <8 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fsub <16 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = fsub undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = fsub undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F32 = fsub undef, undef -; CHECK-NEXT: 
Cost Model: Found an estimated cost of 4 for instruction: %NXV8F32 = fsub undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F32 = fsub undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = fsub <1 x double> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fsub <2 x double> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fsub <4 x double> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = fsub <8 x double> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = fsub undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F64 = fsub undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F64 = fsub undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F64 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fsub <1 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fsub <2 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fsub <4 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fsub <8 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fsub <16 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fsub <32 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%NXV4F16 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fsub <1 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fsub <2 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fsub <4 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fsub <8 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fsub <16 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = fsub <1 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fsub <2 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fsub <4 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fsub <8 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found 
an estimated cost of 4 for instruction: %NXV2F64 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fsub undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %F16 = fsub half undef, undef @@ -168,36 +168,36 @@ define i32 @fmul() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fmul half undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fmul float undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fmul double undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = fmul <1 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = fmul <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fmul <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fmul <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fmul <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32F16 = fmul <32 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = fmul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = fmul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = fmul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV8F16 = fmul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV16F16 = fmul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: 
%NXV32F16 = fmul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F32 = fmul <1 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fmul <2 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fmul <4 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fmul <8 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fmul <16 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = fmul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = fmul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F32 = fmul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F32 = fmul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F32 = fmul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = fmul <1 x double> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fmul <2 x double> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fmul <4 x double> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = fmul <8 x double> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = fmul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F64 = fmul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F64 = fmul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F64 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %V1F16 = fmul <1 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fmul <2 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fmul <4 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fmul <8 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fmul <16 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fmul <32 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fmul <1 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fmul <2 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fmul <4 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fmul <8 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fmul <16 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = 
fmul undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = fmul <1 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fmul <2 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fmul <4 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fmul <8 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fmul undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %F16 = fmul half undef, undef @@ -248,36 +248,36 @@ define i32 @fdiv() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fdiv half undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fdiv float undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fdiv double undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = fdiv <1 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = fdiv <2 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fdiv <4 x half> undef, undef -; 
CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fdiv <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fdiv <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32F16 = fdiv <32 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = fdiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = fdiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = fdiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV8F16 = fdiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV16F16 = fdiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV32F16 = fdiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F32 = fdiv <1 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fdiv <2 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fdiv <4 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fdiv <8 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fdiv <16 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = fdiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = fdiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F32 = fdiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F32 = fdiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: 
%NXV16F32 = fdiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = fdiv <1 x double> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fdiv <2 x double> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fdiv <4 x double> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = fdiv <8 x double> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = fdiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F64 = fdiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F64 = fdiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F64 = fdiv undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fdiv <1 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fdiv <2 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fdiv <4 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fdiv <8 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fdiv <16 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fdiv <32 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fdiv undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fdiv undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fdiv undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fdiv undef, undef +; CHECK-NEXT: Cost Model: 
Found an estimated cost of 8 for instruction: %NXV16F16 = fdiv undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fdiv undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fdiv <1 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fdiv <2 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fdiv <4 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fdiv <8 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fdiv <16 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fdiv undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fdiv undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32 = fdiv undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32 = fdiv undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32 = fdiv undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = fdiv <1 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fdiv <2 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fdiv <4 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fdiv <8 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fdiv undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64 = fdiv undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64 = fdiv 
undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fdiv undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %F16 = fdiv half undef, undef @@ -408,36 +408,36 @@ define i32 @fneg() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fneg half undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fneg float undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fneg double undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = fneg <1 x half> undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = fneg <2 x half> undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fneg <4 x half> undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fneg <8 x half> undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fneg <16 x half> undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32F16 = fneg <32 x half> undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = fneg undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = fneg undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = fneg undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV8F16 = fneg undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV16F16 = fneg undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV32F16 = fneg undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F32 = fneg <1 x float> undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fneg <2 x float> undef -; CHECK-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %V4F32 = fneg <4 x float> undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fneg <8 x float> undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fneg <16 x float> undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = fneg undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = fneg undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F32 = fneg undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F32 = fneg undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F32 = fneg undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = fneg <1 x double> undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fneg <2 x double> undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fneg <4 x double> undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = fneg <8 x double> undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = fneg undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F64 = fneg undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F64 = fneg undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F64 = fneg undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fneg <1 x half> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fneg <2 x half> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fneg <4 x half> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fneg <8 x half> undef +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fneg <16 x half> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fneg <32 x half> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fneg undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fneg undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fneg undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fneg undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16 = fneg undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fneg undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fneg <1 x float> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fneg <2 x float> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fneg <4 x float> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fneg <8 x float> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fneg <16 x float> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fneg undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fneg undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32 = fneg undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32 = fneg undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32 = fneg undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = fneg <1 x double> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fneg <2 x double> undef 
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fneg <4 x double> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fneg <8 x double> undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fneg undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64 = fneg undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64 = fneg undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fneg undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %F16 = fneg half undef diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll index 87ffb23dcb88e..67c081ba5d3c6 100644 --- a/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-intrinsics.ll @@ -21,7 +21,7 @@ define void @unsupported_fp_ops( %vec, i32 %extraarg) { define void @powi( %vec) { ; CHECK-LABEL: 'powi' -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %powi = call @llvm.powi.nxv4f32.i32( %vec, i32 42) +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %powi = call @llvm.powi.nxv4f32.i32( %vec, i32 42) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; TYPEBASED-LABEL: 'powi' @@ -1383,73 +1383,73 @@ define void @reduce_fadd() { define void @vp_fadd(){ ; CHECK-LABEL: 'vp_fadd' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x float> @llvm.vp.fadd.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t1 = fadd <2 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> undef, <4 x 
float> undef, <4 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t3 = fadd <4 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t4 = call <8 x float> @llvm.vp.fadd.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t5 = fadd <8 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t6 = call <16 x float> @llvm.vp.fadd.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t7 = fadd <16 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x double> @llvm.vp.fadd.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t9 = fadd <2 x double> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x double> @llvm.vp.fadd.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t11 = fadd <4 x double> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t12 = call <8 x double> @llvm.vp.fadd.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t13 = fadd <8 x double> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t14 = call <16 x double> @llvm.vp.fadd.v16f64(<16 x double> undef, <16 x double> undef, <16 x i1> undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t15 = fadd <16 x double> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%t17 = call @llvm.vp.fadd.nxv2f32( undef, undef, undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t18 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t19 = call @llvm.vp.fadd.nxv4f32( undef, undef, undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t20 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t21 = call @llvm.vp.fadd.nxv8f32( undef, undef, undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t22 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t23 = call @llvm.vp.fadd.nxv16f32( undef, undef, undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t24 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t25 = call @llvm.vp.fadd.nxv2f64( undef, undef, undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t26 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t27 = call @llvm.vp.fadd.nxv4f64( undef, undef, undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t28 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t29 = call @llvm.vp.fadd.nxv8f64( undef, undef, undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t30 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t31 = call @llvm.vp.fadd.nxv16f64( undef, undef, undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t32 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t0 = call <2 x float> @llvm.vp.fadd.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> 
undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t1 = fadd <2 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t2 = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t3 = fadd <4 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t4 = call <8 x float> @llvm.vp.fadd.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t5 = fadd <8 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t6 = call <16 x float> @llvm.vp.fadd.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t7 = fadd <16 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t8 = call <2 x double> @llvm.vp.fadd.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t9 = fadd <2 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x double> @llvm.vp.fadd.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t11 = fadd <4 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t12 = call <8 x double> @llvm.vp.fadd.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t13 = fadd <8 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t14 = call <16 x double> 
@llvm.vp.fadd.v16f64(<16 x double> undef, <16 x double> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t15 = fadd <16 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t17 = call @llvm.vp.fadd.nxv2f32( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t18 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t19 = call @llvm.vp.fadd.nxv4f32( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t20 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t21 = call @llvm.vp.fadd.nxv8f32( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t22 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t23 = call @llvm.vp.fadd.nxv16f32( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t24 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t25 = call @llvm.vp.fadd.nxv2f64( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t26 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t27 = call @llvm.vp.fadd.nxv4f64( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t28 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t29 = call @llvm.vp.fadd.nxv8f64( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t30 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t31 = call @llvm.vp.fadd.nxv16f64( undef, undef, 
undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t32 = fadd undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; TYPEBASED-LABEL: 'vp_fadd' -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x float> @llvm.vp.fadd.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t1 = fadd <2 x float> undef, undef -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t3 = fadd <4 x float> undef, undef -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t4 = call <8 x float> @llvm.vp.fadd.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t5 = fadd <8 x float> undef, undef -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t6 = call <16 x float> @llvm.vp.fadd.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t7 = fadd <16 x float> undef, undef -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x double> @llvm.vp.fadd.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t9 = fadd <2 x double> undef, undef -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x double> @llvm.vp.fadd.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t11 
= fadd <4 x double> undef, undef -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t12 = call <8 x double> @llvm.vp.fadd.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t13 = fadd <8 x double> undef, undef -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t14 = call <16 x double> @llvm.vp.fadd.v16f64(<16 x double> undef, <16 x double> undef, <16 x i1> undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t15 = fadd <16 x double> undef, undef -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t17 = call @llvm.vp.fadd.nxv2f32( undef, undef, undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t18 = fadd undef, undef -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t19 = call @llvm.vp.fadd.nxv4f32( undef, undef, undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t20 = fadd undef, undef -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t21 = call @llvm.vp.fadd.nxv8f32( undef, undef, undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t22 = fadd undef, undef -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t23 = call @llvm.vp.fadd.nxv16f32( undef, undef, undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t24 = fadd undef, undef -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t25 = call @llvm.vp.fadd.nxv2f64( undef, undef, undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t26 = fadd undef, undef -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t27 = call @llvm.vp.fadd.nxv4f64( undef, 
undef, undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t28 = fadd undef, undef -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t29 = call @llvm.vp.fadd.nxv8f64( undef, undef, undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t30 = fadd undef, undef -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t31 = call @llvm.vp.fadd.nxv16f64( undef, undef, undef, i32 undef) -; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t32 = fadd undef, undef +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t0 = call <2 x float> @llvm.vp.fadd.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t1 = fadd <2 x float> undef, undef +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t2 = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t3 = fadd <4 x float> undef, undef +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t4 = call <8 x float> @llvm.vp.fadd.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t5 = fadd <8 x float> undef, undef +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t6 = call <16 x float> @llvm.vp.fadd.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t7 = fadd <16 x float> undef, undef +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t8 = call <2 x double> @llvm.vp.fadd.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, 
i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t9 = fadd <2 x double> undef, undef +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x double> @llvm.vp.fadd.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t11 = fadd <4 x double> undef, undef +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t12 = call <8 x double> @llvm.vp.fadd.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t13 = fadd <8 x double> undef, undef +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t14 = call <16 x double> @llvm.vp.fadd.v16f64(<16 x double> undef, <16 x double> undef, <16 x i1> undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t15 = fadd <16 x double> undef, undef +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t17 = call @llvm.vp.fadd.nxv2f32( undef, undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t18 = fadd undef, undef +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t19 = call @llvm.vp.fadd.nxv4f32( undef, undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t20 = fadd undef, undef +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t21 = call @llvm.vp.fadd.nxv8f32( undef, undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t22 = fadd undef, undef +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t23 = call @llvm.vp.fadd.nxv16f32( undef, undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for 
instruction: %t24 = fadd undef, undef +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t25 = call @llvm.vp.fadd.nxv2f64( undef, undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t26 = fadd undef, undef +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t27 = call @llvm.vp.fadd.nxv4f64( undef, undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t28 = fadd undef, undef +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t29 = call @llvm.vp.fadd.nxv8f64( undef, undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t30 = fadd undef, undef +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t31 = call @llvm.vp.fadd.nxv16f64( undef, undef, undef, i32 undef) +; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t32 = fadd undef, undef ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %t0 = call <2 x float> @llvm.vp.fadd.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef) diff --git a/llvm/test/CodeGen/LoongArch/andn-icmp.ll b/llvm/test/CodeGen/LoongArch/andn-icmp.ll index 447f3ac5c34fd..67d62eca914e9 100644 --- a/llvm/test/CodeGen/LoongArch/andn-icmp.ll +++ b/llvm/test/CodeGen/LoongArch/andn-icmp.ll @@ -6,14 +6,12 @@ define i1 @andn_icmp_eq_i8(i8 signext %a, i8 signext %b) nounwind { ; LA32-LABEL: andn_icmp_eq_i8: ; LA32: # %bb.0: ; LA32-NEXT: andn $a0, $a1, $a0 -; LA32-NEXT: andi $a0, $a0, 255 ; LA32-NEXT: sltui $a0, $a0, 1 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_eq_i8: ; LA64: # %bb.0: ; LA64-NEXT: andn $a0, $a1, $a0 -; LA64-NEXT: andi $a0, $a0, 255 ; LA64-NEXT: sltui $a0, $a0, 1 ; LA64-NEXT: ret %and = and i8 %a, %b @@ -25,14 +23,12 @@ define i1 @andn_icmp_eq_i16(i16 signext %a, i16 signext %b) nounwind { ; LA32-LABEL: 
andn_icmp_eq_i16: ; LA32: # %bb.0: ; LA32-NEXT: andn $a0, $a1, $a0 -; LA32-NEXT: bstrpick.w $a0, $a0, 15, 0 ; LA32-NEXT: sltui $a0, $a0, 1 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_eq_i16: ; LA64: # %bb.0: ; LA64-NEXT: andn $a0, $a1, $a0 -; LA64-NEXT: bstrpick.d $a0, $a0, 15, 0 ; LA64-NEXT: sltui $a0, $a0, 1 ; LA64-NEXT: ret %and = and i16 %a, %b @@ -80,14 +76,12 @@ define i1 @andn_icmp_ne_i8(i8 signext %a, i8 signext %b) nounwind { ; LA32-LABEL: andn_icmp_ne_i8: ; LA32: # %bb.0: ; LA32-NEXT: andn $a0, $a1, $a0 -; LA32-NEXT: andi $a0, $a0, 255 ; LA32-NEXT: sltu $a0, $zero, $a0 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_ne_i8: ; LA64: # %bb.0: ; LA64-NEXT: andn $a0, $a1, $a0 -; LA64-NEXT: andi $a0, $a0, 255 ; LA64-NEXT: sltu $a0, $zero, $a0 ; LA64-NEXT: ret %and = and i8 %a, %b @@ -99,14 +93,12 @@ define i1 @andn_icmp_ne_i16(i16 signext %a, i16 signext %b) nounwind { ; LA32-LABEL: andn_icmp_ne_i16: ; LA32: # %bb.0: ; LA32-NEXT: andn $a0, $a1, $a0 -; LA32-NEXT: bstrpick.w $a0, $a0, 15, 0 ; LA32-NEXT: sltu $a0, $zero, $a0 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_ne_i16: ; LA64: # %bb.0: ; LA64-NEXT: andn $a0, $a1, $a0 -; LA64-NEXT: bstrpick.d $a0, $a0, 15, 0 ; LA64-NEXT: sltu $a0, $zero, $a0 ; LA64-NEXT: ret %and = and i16 %a, %b @@ -153,15 +145,13 @@ define i1 @andn_icmp_ne_i64(i64 %a, i64 %b) nounwind { define i1 @andn_icmp_ult_i8(i8 signext %a, i8 signext %b) nounwind { ; LA32-LABEL: andn_icmp_ult_i8: ; LA32: # %bb.0: -; LA32-NEXT: andi $a1, $a1, 255 -; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: and $a0, $a0, $a1 ; LA32-NEXT: sltu $a0, $a0, $a1 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_ult_i8: ; LA64: # %bb.0: -; LA64-NEXT: andi $a1, $a1, 255 -; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: and $a0, $a0, $a1 ; LA64-NEXT: sltu $a0, $a0, $a1 ; LA64-NEXT: ret %and = and i8 %a, %b @@ -172,15 +162,13 @@ define i1 @andn_icmp_ult_i8(i8 signext %a, i8 signext %b) nounwind { define i1 @andn_icmp_ult_i16(i16 signext %a, i16 signext %b) nounwind { ; LA32-LABEL: 
andn_icmp_ult_i16: ; LA32: # %bb.0: -; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 -; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: and $a0, $a0, $a1 ; LA32-NEXT: sltu $a0, $a0, $a1 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_ult_i16: ; LA64: # %bb.0: -; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 -; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: and $a0, $a0, $a1 ; LA64-NEXT: sltu $a0, $a0, $a1 ; LA64-NEXT: ret %and = and i16 %a, %b @@ -191,16 +179,14 @@ define i1 @andn_icmp_ult_i16(i16 signext %a, i16 signext %b) nounwind { define i1 @andn_icmp_uge_i8(i8 signext %a, i8 signext %b) nounwind { ; LA32-LABEL: andn_icmp_uge_i8: ; LA32: # %bb.0: -; LA32-NEXT: andi $a1, $a1, 255 -; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: and $a0, $a0, $a1 ; LA32-NEXT: sltu $a0, $a0, $a1 ; LA32-NEXT: xori $a0, $a0, 1 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_uge_i8: ; LA64: # %bb.0: -; LA64-NEXT: andi $a1, $a1, 255 -; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: and $a0, $a0, $a1 ; LA64-NEXT: sltu $a0, $a0, $a1 ; LA64-NEXT: xori $a0, $a0, 1 ; LA64-NEXT: ret @@ -212,16 +198,14 @@ define i1 @andn_icmp_uge_i8(i8 signext %a, i8 signext %b) nounwind { define i1 @andn_icmp_uge_i16(i16 signext %a, i16 signext %b) nounwind { ; LA32-LABEL: andn_icmp_uge_i16: ; LA32: # %bb.0: -; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 -; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: and $a0, $a0, $a1 ; LA32-NEXT: sltu $a0, $a0, $a1 ; LA32-NEXT: xori $a0, $a0, 1 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_uge_i16: ; LA64: # %bb.0: -; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 -; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: and $a0, $a0, $a1 ; LA64-NEXT: sltu $a0, $a0, $a1 ; LA64-NEXT: xori $a0, $a0, 1 ; LA64-NEXT: ret @@ -233,15 +217,13 @@ define i1 @andn_icmp_uge_i16(i16 signext %a, i16 signext %b) nounwind { define i1 @andn_icmp_ugt_i8(i8 signext %a, i8 signext %b) nounwind { ; LA32-LABEL: andn_icmp_ugt_i8: ; LA32: # %bb.0: -; LA32-NEXT: andi $a1, $a1, 255 -; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: and $a0, $a0, $a1 ; LA32-NEXT: sltu $a0, 
$a1, $a0 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_ugt_i8: ; LA64: # %bb.0: -; LA64-NEXT: andi $a1, $a1, 255 -; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: and $a0, $a0, $a1 ; LA64-NEXT: sltu $a0, $a1, $a0 ; LA64-NEXT: ret %and = and i8 %a, %b @@ -252,15 +234,13 @@ define i1 @andn_icmp_ugt_i8(i8 signext %a, i8 signext %b) nounwind { define i1 @andn_icmp_ugt_i16(i16 signext %a, i16 signext %b) nounwind { ; LA32-LABEL: andn_icmp_ugt_i16: ; LA32: # %bb.0: -; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 -; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: and $a0, $a0, $a1 ; LA32-NEXT: sltu $a0, $a1, $a0 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_ugt_i16: ; LA64: # %bb.0: -; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 -; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: and $a0, $a0, $a1 ; LA64-NEXT: sltu $a0, $a1, $a0 ; LA64-NEXT: ret %and = and i16 %a, %b @@ -271,16 +251,14 @@ define i1 @andn_icmp_ugt_i16(i16 signext %a, i16 signext %b) nounwind { define i1 @andn_icmp_ule_i8(i8 signext %a, i8 signext %b) nounwind { ; LA32-LABEL: andn_icmp_ule_i8: ; LA32: # %bb.0: -; LA32-NEXT: andi $a1, $a1, 255 -; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: and $a0, $a0, $a1 ; LA32-NEXT: sltu $a0, $a1, $a0 ; LA32-NEXT: xori $a0, $a0, 1 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_ule_i8: ; LA64: # %bb.0: -; LA64-NEXT: andi $a1, $a1, 255 -; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: and $a0, $a0, $a1 ; LA64-NEXT: sltu $a0, $a1, $a0 ; LA64-NEXT: xori $a0, $a0, 1 ; LA64-NEXT: ret @@ -292,16 +270,14 @@ define i1 @andn_icmp_ule_i8(i8 signext %a, i8 signext %b) nounwind { define i1 @andn_icmp_ule_i16(i16 signext %a, i16 signext %b) nounwind { ; LA32-LABEL: andn_icmp_ule_i16: ; LA32: # %bb.0: -; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 -; LA32-NEXT: and $a0, $a1, $a0 +; LA32-NEXT: and $a0, $a0, $a1 ; LA32-NEXT: sltu $a0, $a1, $a0 ; LA32-NEXT: xori $a0, $a0, 1 ; LA32-NEXT: ret ; ; LA64-LABEL: andn_icmp_ule_i16: ; LA64: # %bb.0: -; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 -; LA64-NEXT: and $a0, $a1, $a0 +; LA64-NEXT: and $a0, 
$a0, $a1 ; LA64-NEXT: sltu $a0, $a1, $a0 ; LA64-NEXT: xori $a0, $a0, 1 ; LA64-NEXT: ret @@ -606,7 +582,6 @@ define i1 @andn_icmp_eq_i8_i32(i8 signext %a, i8 signext %b) nounwind { ; LA32-LABEL: andn_icmp_eq_i8_i32: ; LA32: # %bb.0: ; LA32-NEXT: andn $a0, $a1, $a0 -; LA32-NEXT: andi $a0, $a0, 255 ; LA32-NEXT: sltui $a0, $a0, 1 ; LA32-NEXT: ret ; diff --git a/llvm/test/CodeGen/PowerPC/patchable-function-entry.ll b/llvm/test/CodeGen/PowerPC/patchable-function-entry.ll new file mode 100644 index 0000000000000..0c2d2829a6d4b --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/patchable-function-entry.ll @@ -0,0 +1,58 @@ +; RUN: llc -mtriple=powerpc %s -o - | FileCheck %s --check-prefixes=CHECK,PPC32 +; RUN: llc -mtriple=powerpc64 %s -o - | FileCheck %s --check-prefixes=CHECK,PPC64 + +@a = global i32 0, align 4 + +define void @f0() { +; CHECK-LABEL: f0: +; CHECK-NOT: nop +; CHECK: # %bb.0: +; CHECK-NEXT: blr +; CHECK-NOT: .section __patchable_function_entries + ret void +} + +define void @f1() "patchable-function-entry"="0" { +; CHECK-LABEL: f1: +; CHECK-NOT: nop +; CHECK: # %bb.0: +; CHECK-NEXT: blr +; CHECK-NOT: .section __patchable_function_entries + ret void +} + +define void @f2() "patchable-function-entry"="1" { +; CHECK-LABEL: f2: +; CHECK-LABEL-NEXT: .Lfunc_begin2: +; CHECK: # %bb.0: +; CHECK-NEXT: nop +; CHECK-NEXT: blr +; CHECK: .section __patchable_function_entries +; PPC32: .p2align 2, 0x0 +; PPC64: .p2align 3, 0x0 +; PPC32-NEXT: .long .Lfunc_begin2 +; PPC64-NEXT: .quad .Lfunc_begin2 + ret void +} + +define i32 @f3() "patchable-function-entry"="1" "patchable-function-prefix"="2" { +; CHECK-LABEL: .Ltmp0: +; CHECK-COUNT-2: nop +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: nop +; PPC32: lis 3, a@ha +; PPC32-NEXT: lwz 3, a@l(3) +; PPC64: addis 3, 2, .LC0@toc@ha +; PPC64-NEXT: ld 3, .LC0@toc@l(3) +; PPC64-NEXT: lwz 3, 0(3) +; CHECK: blr +; CHECK: .section __patchable_function_entries +; PPC32: .p2align 2, 0x0 +; PPC64: .p2align 3, 0x0 +; PPC32-NEXT: .long 
.Ltmp0 +; PPC64-NEXT: .quad .Ltmp0 +entry: + %0 = load i32, ptr @a, align 4 + ret i32 %0 +} diff --git a/llvm/test/TableGen/riscv-target-def.td b/llvm/test/TableGen/riscv-target-def.td index 7137cf96fd3d4..c071cfd731cb5 100644 --- a/llvm/test/TableGen/riscv-target-def.td +++ b/llvm/test/TableGen/riscv-target-def.td @@ -12,6 +12,11 @@ class RISCVExtension groupID, int bitPos> { + int GroupID = groupID; + int BitPos = bitPos; +} + class RISCVExperimentalExtension implies = [], string fieldname = !subst("Feature", "Has", NAME), @@ -23,7 +28,8 @@ class RISCVExperimentalExtension; + "'I' (Base Integer Instruction Set)">, + RISCVExtensionBitmask<0, 8>; def FeatureStdExtZicsr : RISCVExtension<"zicsr", 2, 0, @@ -36,7 +42,8 @@ def FeatureStdExtZifencei def FeatureStdExtF : RISCVExtension<"f", 2, 2, "'F' (Single-Precision Floating-Point)", - [FeatureStdExtZicsr]>; + [FeatureStdExtZicsr]>, + RISCVExtensionBitmask<0, 5>; def FeatureStdExtZidummy : RISCVExperimentalExtension<"zidummy", 0, 1, @@ -171,3 +178,10 @@ def ROCKET : RISCVTuneProcessorModel<"rocket", // CHECK-NEXT: TUNE_PROC(ROCKET, "rocket") // CHECK: #undef TUNE_PROC + +// CHECK: #ifdef GET_RISCVExtensionBitmaskTable_IMPL +// CHECK-NEXT: static const RISCVExtensionBitmask ExtensionBitmask[]={ +// CHECK-NEXT: {"f", 0, 5ULL}, +// CHECK-NEXT: {"i", 0, 8ULL}, +// CHECK-NEXT: }; +// CHECK-NEXT: #endif diff --git a/llvm/test/Transforms/DeadStoreElimination/memoryssa-scan-limit.ll b/llvm/test/Transforms/DeadStoreElimination/memoryssa-scan-limit.ll index bd225def3addc..231aba5cc2f0c 100644 --- a/llvm/test/Transforms/DeadStoreElimination/memoryssa-scan-limit.ll +++ b/llvm/test/Transforms/DeadStoreElimination/memoryssa-scan-limit.ll @@ -4,6 +4,7 @@ ; RUN: opt < %s -passes=dse -dse-memoryssa-scanlimit=0 -S | FileCheck --check-prefix=LIMIT-0 %s ; RUN: opt < %s -passes=dse -dse-memoryssa-scanlimit=2 -S | FileCheck --check-prefix=LIMIT-2 %s ; RUN: opt < %s -passes=dse -dse-memoryssa-scanlimit=3 -S | FileCheck --check-prefix=LIMIT-3 
%s +; RUN: opt < %s -passes=dse -dse-memoryssa-scanlimit=4 -S | FileCheck --check-prefix=LIMIT-4 %s target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" @@ -71,3 +72,47 @@ bb3: store i32 0, ptr %P ret void } + +define void @duplicate_worklist_endoffunction(ptr %ptr.0, ptr %ptr.1) { +; LIMIT-4-LABEL: @duplicate_worklist_endoffunction( +; LIMIT-4-NEXT: entry: +; LIMIT-4-NEXT: [[STACK_0:%.*]] = alloca [768 x i8], align 16 +; LIMIT-4-NEXT: [[VAL_0:%.*]] = load i16, ptr [[PTR_1:%.*]], align 8 +; LIMIT-4-NEXT: [[COND:%.*]] = icmp ugt i16 [[VAL_0]], 24 +; LIMIT-4-NEXT: br i1 [[COND]], label [[BB_1:%.*]], label [[EXIT:%.*]] +; LIMIT-4: bb.1: +; LIMIT-4-NEXT: br label [[LOOP:%.*]] +; LIMIT-4: loop: +; LIMIT-4-NEXT: [[IV:%.*]] = phi i64 [ 0, [[BB_1]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; LIMIT-4-NEXT: [[PTR_3:%.*]] = getelementptr i8, ptr [[STACK_0]], i64 [[IV]] +; LIMIT-4-NEXT: store ptr [[PTR_0:%.*]], ptr [[PTR_3]], align 2 +; LIMIT-4-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 +; LIMIT-4-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 10 +; LIMIT-4-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]] +; LIMIT-4: exit: +; LIMIT-4-NEXT: ret void +; +entry: + %stack.0 = alloca [768 x i8], align 16 + %stack.1 = alloca [20 x i8], align 8 + %val.0 = load i16, ptr %ptr.1, align 8 + %cond = icmp ugt i16 %val.0, 24 + br i1 %cond, label %bb.1, label %exit + +bb.1: ; preds = %entry + %ptr.2 = getelementptr inbounds i8, ptr %ptr.1, i64 8 + %val.1 = load i64, ptr %ptr.2, align 8 + store i64 %val.1, ptr %stack.1, align 8 + br label %loop + +loop: ; preds = %loop, %bb.1 + %iv = phi i64 [ 0, %bb.1 ], [ %iv.next, %loop ] + %ptr.3 = getelementptr i8, ptr %stack.0, i64 %iv + store ptr %ptr.0, ptr %ptr.3, align 2 + %iv.next = add nuw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 10 + br i1 %exitcond, label %exit, label %loop + +exit: ; preds = %loop, %entry + ret void +} diff --git a/llvm/test/Transforms/InstCombine/select-binop-cmp.ll 
b/llvm/test/Transforms/InstCombine/select-binop-cmp.ll index 1fa0c09a9e987..fb56764598e2d 100644 --- a/llvm/test/Transforms/InstCombine/select-binop-cmp.ll +++ b/llvm/test/Transforms/InstCombine/select-binop-cmp.ll @@ -1315,6 +1315,19 @@ define i32 @select_replace_call_speculatable(i32 %x, i32 %y) { ret i32 %s } +define i32 @select_replace_call_speculatable_intrinsic(i32 %x, i32 %y) { +; CHECK-LABEL: @select_replace_call_speculatable_intrinsic( +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @llvm.smax.i32(i32 [[Y:%.*]], i32 0) +; CHECK-NEXT: [[S:%.*]] = select i1 [[C]], i32 [[CALL]], i32 [[Y]] +; CHECK-NEXT: ret i32 [[S]] +; + %c = icmp eq i32 %x, 0 + %call = call i32 @llvm.smax.i32(i32 %x, i32 %y) + %s = select i1 %c, i32 %call, i32 %y + ret i32 %s +} + ; We can't replace the call arguments, as the call is not speculatable. We ; may end up changing side-effects or causing undefined behavior. define i32 @select_replace_call_non_speculatable(i32 %x, i32 %y) { diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index d66ffb9a63ac1..1369be305ec13 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -4713,3 +4713,19 @@ define i8 @select_knownbits_simplify_missing_noundef(i8 %x) { %res = select i1 %cmp, i8 %and, i8 0 ret i8 %res } + +@g_ext = external global i8 + +; Make sure we don't replace %ptr with @g_ext, which may cause the load to trigger UB. 
+define i32 @pr99436(ptr align 4 dereferenceable(4) %ptr) { +; CHECK-LABEL: @pr99436( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[PTR:%.*]], @g_ext +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[PTR]], align 4 +; CHECK-NEXT: [[RET:%.*]] = select i1 [[CMP]], i32 [[VAL]], i32 0 +; CHECK-NEXT: ret i32 [[RET]] +; + %cmp = icmp eq ptr %ptr, @g_ext + %val = load i32, ptr %ptr, align 4 + %ret = select i1 %cmp, i32 %val, i32 0 + ret i32 %ret +} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll index e50d7362365b8..a151232df0cd5 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll @@ -39,32 +39,32 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 625, [[TMP4]] ; RV32-NEXT: [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]] ; RV32-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 16 -; RV32-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2 -; RV32-NEXT: [[TMP5:%.*]] = call @llvm.experimental.stepvector.nxv2i64() -; RV32-NEXT: [[TMP6:%.*]] = add [[TMP5]], zeroinitializer -; RV32-NEXT: [[TMP7:%.*]] = mul [[TMP6]], shufflevector ( insertelement ( poison, i64 16, i64 0), poison, zeroinitializer) -; RV32-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] -; RV32-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 -; RV32-NEXT: [[TMP10:%.*]] = mul i64 16, [[TMP9]] -; RV32-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 +; RV32-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 +; RV32-NEXT: [[TMP7:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; RV32-NEXT: [[TMP8:%.*]] = add [[TMP7]], zeroinitializer +; RV32-NEXT: [[TMP9:%.*]] = mul [[TMP8]], shufflevector 
( insertelement ( poison, i64 16, i64 0), poison, zeroinitializer) +; RV32-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP9]] +; RV32-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; RV32-NEXT: [[TMP12:%.*]] = mul i64 16, [[TMP11]] +; RV32-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 ; RV32-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; RV32-NEXT: br label [[VECTOR_BODY:%.*]] ; RV32: vector.body: ; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; RV32-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; RV32-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], [[VEC_IND]] -; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i32.nxv2p0( [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison), !alias.scope !0 -; RV32-NEXT: [[TMP12:%.*]] = icmp slt [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer) -; RV32-NEXT: [[TMP13:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[B]], [[TMP13]] -; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv2f64.nxv2p0( [[TMP14]], i32 8, [[TMP12]], poison), !alias.scope !3 -; RV32-NEXT: [[TMP15:%.*]] = sitofp [[WIDE_MASKED_GATHER]] to -; RV32-NEXT: [[TMP16:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP15]] -; RV32-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] -; RV32-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0( [[TMP16]], [[TMP17]], i32 8, [[TMP12]]), !alias.scope !5, !noalias !7 -; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] +; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds 
i32, ptr [[TRIGGER]], [[VEC_IND]] +; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i32.nxv2p0( [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison), !alias.scope [[META0:![0-9]+]] +; RV32-NEXT: [[TMP14:%.*]] = icmp slt [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer) +; RV32-NEXT: [[TMP15:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[B]], [[TMP15]] +; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv2f64.nxv2p0( [[TMP16]], i32 8, [[TMP14]], poison), !alias.scope [[META3:![0-9]+]] +; RV32-NEXT: [[TMP17:%.*]] = sitofp [[WIDE_MASKED_GATHER]] to +; RV32-NEXT: [[TMP18:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP17]] +; RV32-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] +; RV32-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0( [[TMP18]], [[TMP19]], i32 8, [[TMP14]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] +; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; RV32-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; RV32-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; RV32-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -121,32 +121,32 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 625, [[TMP4]] ; RV64-NEXT: [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]] ; RV64-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 16 -; RV64-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2 -; RV64-NEXT: [[TMP5:%.*]] = call @llvm.experimental.stepvector.nxv2i64() -; RV64-NEXT: [[TMP6:%.*]] = add [[TMP5]], zeroinitializer -; RV64-NEXT: [[TMP7:%.*]] = mul 
[[TMP6]], shufflevector ( insertelement ( poison, i64 16, i64 0), poison, zeroinitializer) -; RV64-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] -; RV64-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 -; RV64-NEXT: [[TMP10:%.*]] = mul i64 16, [[TMP9]] -; RV64-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 +; RV64-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 +; RV64-NEXT: [[TMP7:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; RV64-NEXT: [[TMP8:%.*]] = add [[TMP7]], zeroinitializer +; RV64-NEXT: [[TMP9:%.*]] = mul [[TMP8]], shufflevector ( insertelement ( poison, i64 16, i64 0), poison, zeroinitializer) +; RV64-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP9]] +; RV64-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; RV64-NEXT: [[TMP12:%.*]] = mul i64 16, [[TMP11]] +; RV64-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 ; RV64-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; RV64-NEXT: br label [[VECTOR_BODY:%.*]] ; RV64: vector.body: ; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; RV64-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; RV64-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], [[VEC_IND]] -; RV64-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i32.nxv2p0( [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison), !alias.scope !0 -; RV64-NEXT: [[TMP12:%.*]] = icmp slt [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer) -; RV64-NEXT: [[TMP13:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, 
zeroinitializer) -; RV64-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[B]], [[TMP13]] -; RV64-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv2f64.nxv2p0( [[TMP14]], i32 8, [[TMP12]], poison), !alias.scope !3 -; RV64-NEXT: [[TMP15:%.*]] = sitofp [[WIDE_MASKED_GATHER]] to -; RV64-NEXT: [[TMP16:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP15]] -; RV64-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] -; RV64-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0( [[TMP16]], [[TMP17]], i32 8, [[TMP12]]), !alias.scope !5, !noalias !7 -; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] +; RV64-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], [[VEC_IND]] +; RV64-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i32.nxv2p0( [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison), !alias.scope [[META0:![0-9]+]] +; RV64-NEXT: [[TMP14:%.*]] = icmp slt [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer) +; RV64-NEXT: [[TMP15:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; RV64-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[B]], [[TMP15]] +; RV64-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv2f64.nxv2p0( [[TMP16]], i32 8, [[TMP14]], poison), !alias.scope [[META3:![0-9]+]] +; RV64-NEXT: [[TMP17:%.*]] = sitofp [[WIDE_MASKED_GATHER]] to +; RV64-NEXT: [[TMP18:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP17]] +; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] +; RV64-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0( [[TMP18]], [[TMP19]], i32 8, [[TMP14]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] +; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; RV64-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; RV64-NEXT: 
[[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; RV64-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index 8e9713fecf29d..fc310f4163082 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -54,43 +54,46 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Loop does not require scalar epilogue ; CHECK-NEXT: LV: Loop does not require scalar epilogue ; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF -; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count -; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count -; CHECK: ir-bb: -; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64) +; CHECK-NEXT: Live-in vp<%0> = VF * UF +; CHECK-NEXT: Live-in vp<%1> = vector-trip-count +; CHECK-NEXT: vp<%2> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: EMIT vp<%2> = EXPAND SCEV (zext i32 %n to i64) ; CHECK-NEXT: No successors -; CHECK: vector.ph: +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop -; CHECK: vector loop: { +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION -; CHECK-NEXT: vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> -; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<-1> -; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1> -; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer (reverse) ir<%arrayidx> -; CHECK-NEXT: WIDEN ir<%1> = 
load vp<[[VEC_PTR]]> -; CHECK-NEXT: WIDEN ir<%add9> = add ir<%1>, ir<1> -; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer (reverse) ir<%arrayidx3> -; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add9> -; CHECK-NEXT: EMIT vp<[[IV_INC:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[IV_INC]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%8> +; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%n> + vp<%3> * ir<-1> +; CHECK-NEXT: vp<%5> = SCALAR-STEPS vp<%4>, ir<-1> +; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%5>, ir<-1> +; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> +; CHECK-NEXT: vp<%6> = vector-pointer (reverse) ir<%arrayidx> +; CHECK-NEXT: WIDEN ir<%1> = load vp<%6> +; CHECK-NEXT: WIDEN ir<%add9> = add ir<%1>, ir<1> +; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> +; CHECK-NEXT: vp<%7> = vector-pointer (reverse) ir<%arrayidx3> +; CHECK-NEXT: WIDEN store vp<%7>, ir<%add9> +; CHECK-NEXT: EMIT vp<%8> = add nuw vp<%3>, vp<%0> +; CHECK-NEXT: EMIT branch-on-count vp<%8>, vp<%1> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: -; CHECK: middle.block: -; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, vp<[[VEC_TC]]> -; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%10> = icmp eq vp<%2>, vp<%1> +; CHECK-NEXT: EMIT branch-on-cond vp<%10> ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph ; CHECK-EMPTY: -; CHECK-NEXT: ir-bb +; CHECK-NEXT: ir-bb: ; CHECK-NEXT: No successors ; CHECK-EMPTY: -; CHECK-NEXT: scalar.ph +; CHECK-NEXT: scalar.ph: ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ 
%indvars.iv.next, %for.body ] @@ -134,7 +137,52 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop ; CHECK-NEXT: VF picked by VPlan cost model: vscale x 4 ; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 -; CHECK: LV: Interleaving disabled by the pass manager +; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = VF * UF +; CHECK-NEXT: Live-in vp<%1> = vector-trip-count +; CHECK-NEXT: vp<%2> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: EMIT vp<%2> = EXPAND SCEV (zext i32 %n to i64) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%8> +; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%n> + vp<%3> * ir<-1> +; CHECK-NEXT: vp<%5> = SCALAR-STEPS vp<%4>, ir<-1> +; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%5>, ir<-1> +; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> +; CHECK-NEXT: vp<%6> = vector-pointer (reverse) ir<%arrayidx> +; CHECK-NEXT: WIDEN ir<%13> = load vp<%6> +; CHECK-NEXT: WIDEN ir<%add9> = add ir<%13>, ir<1> +; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> +; CHECK-NEXT: vp<%7> = vector-pointer (reverse) ir<%arrayidx3> +; CHECK-NEXT: WIDEN store vp<%7>, ir<%add9> +; CHECK-NEXT: EMIT vp<%8> = add nuw vp<%3>, vp<%0> +; CHECK-NEXT: EMIT branch-on-count vp<%8>, vp<%1> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%10> = icmp eq vp<%2>, vp<%1> +; CHECK-NEXT: EMIT branch-on-cond vp<%10> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; 
CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: LV: Loop does not require scalar epilogue +; CHECK-NEXT: LV: Loop does not require scalar epilogue +; CHECK-NEXT: LV: Interleaving disabled by the pass manager ; CHECK-NEXT: LV: Loop does not require scalar epilogue ; CHECK-NEXT: LV: Vectorizing: innermost loop. ; CHECK-EMPTY: @@ -193,7 +241,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00 +; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 @@ -210,37 +258,40 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Loop does not require scalar epilogue ; CHECK-NEXT: LV: Loop does not require scalar epilogue ; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF -; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count -; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count -; CHECK: ir-bb: -; CHECK-NEXT: 
EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64) +; CHECK-NEXT: Live-in vp<%0> = VF * UF +; CHECK-NEXT: Live-in vp<%1> = vector-trip-count +; CHECK-NEXT: vp<%2> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: EMIT vp<%2> = EXPAND SCEV (zext i32 %n to i64) ; CHECK-NEXT: No successors -; CHECK: vector.ph: +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop -; CHECK: vector loop: { +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION -; CHECK-NEXT: vp<[[DERIVED_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1> -; CHECK-NEXT: vp<[[STEPS]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<-1> -; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1> -; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer (reverse) ir<%arrayidx> -; CHECK-NEXT: WIDEN ir<%1> = load vp<[[VEC_PTR]]> -; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%1>, ir<1.000000e+00> -; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer (reverse) ir<%arrayidx3> -; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv1> -; CHECK-NEXT: EMIT vp<[[IV_INC:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[IV_INC]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%8> +; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%n> + vp<%3> * ir<-1> +; CHECK-NEXT: vp<%5> = SCALAR-STEPS vp<%4>, ir<-1> +; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%5>, ir<-1> +; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> +; CHECK-NEXT: vp<%6> = vector-pointer (reverse) ir<%arrayidx> +; CHECK-NEXT: WIDEN ir<%1> = load vp<%6> +; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%1>, 
ir<1.000000e+00> +; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> +; CHECK-NEXT: vp<%7> = vector-pointer (reverse) ir<%arrayidx3> +; CHECK-NEXT: WIDEN store vp<%7>, ir<%conv1> +; CHECK-NEXT: EMIT vp<%8> = add nuw vp<%3>, vp<%0> +; CHECK-NEXT: EMIT branch-on-count vp<%8>, vp<%1> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: -; CHECK: middle.block: -; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, vp<[[VEC_TC]]> -; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%10> = icmp eq vp<%2>, vp<%1> +; CHECK-NEXT: EMIT branch-on-cond vp<%10> ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: @@ -255,7 +306,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00 +; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom ; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 @@ -281,7 +332,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: The 
target has 31 registers of RISCV::GPRRC register class ; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class ; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Loop cost is 32 +; CHECK-NEXT: LV: Loop cost is 34 ; CHECK-NEXT: LV: IC is 1 ; CHECK-NEXT: LV: VF is vscale x 4 ; CHECK-NEXT: LV: Not Interleaving. @@ -290,7 +341,52 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop ; CHECK-NEXT: VF picked by VPlan cost model: vscale x 4 ; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 -; CHECK: LV: Interleaving disabled by the pass manager +; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = VF * UF +; CHECK-NEXT: Live-in vp<%1> = vector-trip-count +; CHECK-NEXT: vp<%2> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: EMIT vp<%2> = EXPAND SCEV (zext i32 %n to i64) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%8> +; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%n> + vp<%3> * ir<-1> +; CHECK-NEXT: vp<%5> = SCALAR-STEPS vp<%4>, ir<-1> +; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<%5>, ir<-1> +; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> +; CHECK-NEXT: vp<%6> = vector-pointer (reverse) ir<%arrayidx> +; CHECK-NEXT: WIDEN ir<%13> = load vp<%6> +; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%13>, ir<1.000000e+00> +; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> +; CHECK-NEXT: vp<%7> = vector-pointer (reverse) ir<%arrayidx3> +; CHECK-NEXT: WIDEN store vp<%7>, ir<%conv1> +; CHECK-NEXT: EMIT vp<%8> = add nuw vp<%3>, vp<%0> +; CHECK-NEXT: EMIT branch-on-count 
vp<%8>, vp<%1> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%10> = icmp eq vp<%2>, vp<%1> +; CHECK-NEXT: EMIT branch-on-cond vp<%10> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: LV: Loop does not require scalar epilogue +; CHECK-NEXT: LV: Loop does not require scalar epilogue +; CHECK-NEXT: LV: Interleaving disabled by the pass manager ; CHECK-NEXT: LV: Loop does not require scalar epilogue ; CHECK-NEXT: LV: Vectorizing: innermost loop. ; diff --git a/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll b/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll index 03fbb5e5a4674..97608174b524d 100644 --- a/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll +++ b/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll @@ -1130,14 +1130,23 @@ define @umax_nxv1i64_anymask( %x, i64 %y, < } define @fadd_nxv1f32_allonesmask( %x, float %y, i32 zeroext %evl) { -; ALL-LABEL: @fadd_nxv1f32_allonesmask( -; ALL-NEXT: [[SPLAT:%.*]] = insertelement poison, i1 true, i32 0 -; ALL-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer -; ALL-NEXT: [[TMP1:%.*]] = insertelement poison, float [[Y:%.*]], i64 0 -; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.fadd.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) -; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) -; ALL-NEXT: ret [[TMP4]] +; VEC-COMBINE-LABEL: @fadd_nxv1f32_allonesmask( +; VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement poison, i1 true, i32 0 +; VEC-COMBINE-NEXT: 
[[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer +; VEC-COMBINE-NEXT: [[TMP1:%.*]] = fadd float [[Y:%.*]], 4.200000e+01 +; VEC-COMBINE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[TMP1]], i64 0 +; VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP2]], [[MASK]], i32 [[EVL:%.*]]) +; VEC-COMBINE-NEXT: ret [[TMP3]] +; +; NO-VEC-COMBINE-LABEL: @fadd_nxv1f32_allonesmask( +; NO-VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement poison, i1 true, i32 0 +; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer +; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, float [[Y:%.*]], i64 0 +; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.fadd.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) +; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; %splat = insertelement poison, i1 -1, i32 0 %mask = shufflevector %splat, poison, zeroinitializer @@ -1164,14 +1173,23 @@ define @fadd_nxv1f32_anymask( %x, float } define @fsub_nxv1f32_allonesmask( %x, float %y, i32 zeroext %evl) { -; ALL-LABEL: @fsub_nxv1f32_allonesmask( -; ALL-NEXT: [[SPLAT:%.*]] = insertelement poison, i1 true, i32 0 -; ALL-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer -; ALL-NEXT: [[TMP1:%.*]] = insertelement poison, float [[Y:%.*]], i64 0 -; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.fsub.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) -; ALL-NEXT: [[TMP4:%.*]] = call 
@llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) -; ALL-NEXT: ret [[TMP4]] +; VEC-COMBINE-LABEL: @fsub_nxv1f32_allonesmask( +; VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement poison, i1 true, i32 0 +; VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer +; VEC-COMBINE-NEXT: [[TMP1:%.*]] = fsub float [[Y:%.*]], 4.200000e+01 +; VEC-COMBINE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[TMP1]], i64 0 +; VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP2]], [[MASK]], i32 [[EVL:%.*]]) +; VEC-COMBINE-NEXT: ret [[TMP3]] +; +; NO-VEC-COMBINE-LABEL: @fsub_nxv1f32_allonesmask( +; NO-VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement poison, i1 true, i32 0 +; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer +; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, float [[Y:%.*]], i64 0 +; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.fsub.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) +; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; %splat = insertelement poison, i1 -1, i32 0 %mask = shufflevector %splat, poison, zeroinitializer @@ -1198,14 +1216,23 @@ define @fsub_nxv1f32_anymask( %x, float } define @fdiv_nxv1f32_allonesmask( %x, float %y, i32 zeroext %evl) { -; ALL-LABEL: @fdiv_nxv1f32_allonesmask( -; ALL-NEXT: [[SPLAT:%.*]] = insertelement poison, i1 true, i32 0 -; ALL-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer -; ALL-NEXT: [[TMP1:%.*]] = insertelement poison, float [[Y:%.*]], i64 0 -; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, 
zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.fdiv.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) -; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) -; ALL-NEXT: ret [[TMP4]] +; VEC-COMBINE-LABEL: @fdiv_nxv1f32_allonesmask( +; VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement poison, i1 true, i32 0 +; VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer +; VEC-COMBINE-NEXT: [[TMP1:%.*]] = fdiv float [[Y:%.*]], 4.200000e+01 +; VEC-COMBINE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[TMP1]], i64 0 +; VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP2]], [[MASK]], i32 [[EVL:%.*]]) +; VEC-COMBINE-NEXT: ret [[TMP3]] +; +; NO-VEC-COMBINE-LABEL: @fdiv_nxv1f32_allonesmask( +; NO-VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement poison, i1 true, i32 0 +; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer +; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, float [[Y:%.*]], i64 0 +; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.fdiv.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) +; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; %splat = insertelement poison, i1 -1, i32 0 %mask = shufflevector %splat, poison, zeroinitializer @@ -1275,14 +1302,23 @@ define @frem_nxv1f32_allonesmask( %x, f } define @fdiv_nxv1f32_allonesmask_knownvl( %x, float %y) { -; ALL-LABEL: @fdiv_nxv1f32_allonesmask_knownvl( -; ALL-NEXT: [[SPLAT:%.*]] = insertelement poison, 
i1 true, i32 0 -; ALL-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer -; ALL-NEXT: [[TMP1:%.*]] = insertelement poison, float [[Y:%.*]], i64 0 -; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.fdiv.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, zeroinitializer), [[MASK]], i32 4) -; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 4) -; ALL-NEXT: ret [[TMP4]] +; VEC-COMBINE-LABEL: @fdiv_nxv1f32_allonesmask_knownvl( +; VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement poison, i1 true, i32 0 +; VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer +; VEC-COMBINE-NEXT: [[TMP1:%.*]] = fdiv float [[Y:%.*]], 4.200000e+01 +; VEC-COMBINE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[TMP1]], i64 0 +; VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP2]], [[MASK]], i32 4) +; VEC-COMBINE-NEXT: ret [[TMP3]] +; +; NO-VEC-COMBINE-LABEL: @fdiv_nxv1f32_allonesmask_knownvl( +; NO-VEC-COMBINE-NEXT: [[SPLAT:%.*]] = insertelement poison, i1 true, i32 0 +; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer +; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, float [[Y:%.*]], i64 0 +; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.fdiv.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, zeroinitializer), [[MASK]], i32 4) +; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 4) +; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; %splat = insertelement poison, i1 -1, i32 0 %mask = shufflevector %splat, poison, zeroinitializer diff --git 
a/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt b/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt index af431658c9b7d..cc868e7587dc6 100644 --- a/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt +++ b/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt @@ -28,6 +28,7 @@ add_llvm_unittest(OrcJITTests LookupAndRecordAddrsTest.cpp MachOPlatformTest.cpp MapperJITLinkMemoryManagerTest.cpp + MemoryFlagsTest.cpp MemoryMapperTest.cpp ObjectFormatsTest.cpp ObjectLinkingLayerTest.cpp diff --git a/llvm/unittests/ExecutionEngine/Orc/MemoryFlagsTest.cpp b/llvm/unittests/ExecutionEngine/Orc/MemoryFlagsTest.cpp new file mode 100644 index 0000000000000..270ea2f442b9e --- /dev/null +++ b/llvm/unittests/ExecutionEngine/Orc/MemoryFlagsTest.cpp @@ -0,0 +1,55 @@ +//===------- MemoryFlagsTest.cpp - Test MemoryFlags and related APIs ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h" +#include "gtest/gtest.h" + +#include + +using namespace llvm; +using namespace llvm::orc; + +TEST(MemProtTest, Basics) { + MemProt MPNone = MemProt::None; + + EXPECT_EQ(MPNone & MemProt::Read, MemProt::None); + EXPECT_EQ(MPNone & MemProt::Write, MemProt::None); + EXPECT_EQ(MPNone & MemProt::Exec, MemProt::None); + + EXPECT_EQ(MPNone | MemProt::Read, MemProt::Read); + EXPECT_EQ(MPNone | MemProt::Write, MemProt::Write); + EXPECT_EQ(MPNone | MemProt::Exec, MemProt::Exec); + + MemProt MPAll = MemProt::Read | MemProt::Write | MemProt::Exec; + EXPECT_EQ(MPAll & MemProt::Read, MemProt::Read); + EXPECT_EQ(MPAll & MemProt::Write, MemProt::Write); + EXPECT_EQ(MPAll & MemProt::Exec, MemProt::Exec); +} + +TEST(AllocGroupSmallMap, EmptyMap) { + AllocGroupSmallMap EM; + EXPECT_TRUE(EM.empty()); + 
EXPECT_EQ(EM.size(), 0u); +} + +TEST(AllocGroupSmallMap, NonEmptyMap) { + AllocGroupSmallMap NEM; + NEM[MemProt::Read] = 42; + + EXPECT_FALSE(NEM.empty()); + EXPECT_EQ(NEM.size(), 1U); + EXPECT_EQ(NEM[MemProt::Read], 42U); + EXPECT_EQ(NEM.find(MemProt::Read), NEM.begin()); + EXPECT_EQ(NEM.find(MemProt::Read | MemProt::Write), NEM.end()); + + NEM[MemProt::Read | MemProt::Write] = 7; + EXPECT_EQ(NEM.size(), 2U); + EXPECT_EQ(NEM.begin()->second, 42U); + EXPECT_EQ((NEM.begin() + 1)->second, 7U); +} + diff --git a/llvm/unittests/Support/raw_socket_stream_test.cpp b/llvm/unittests/Support/raw_socket_stream_test.cpp index c4e8cfbbe7e6a..348fb4bb3e089 100644 --- a/llvm/unittests/Support/raw_socket_stream_test.cpp +++ b/llvm/unittests/Support/raw_socket_stream_test.cpp @@ -62,17 +62,50 @@ TEST(raw_socket_streamTest, CLIENT_TO_SERVER_AND_SERVER_TO_CLIENT) { ssize_t BytesRead = Server.read(Bytes, 8); std::string string(Bytes, 8); + ASSERT_EQ(Server.has_error(), false); ASSERT_EQ(8, BytesRead); ASSERT_EQ("01234567", string); } -TEST(raw_socket_streamTest, TIMEOUT_PROVIDED) { +TEST(raw_socket_streamTest, READ_WITH_TIMEOUT) { if (!hasUnixSocketSupport()) GTEST_SKIP(); SmallString<100> SocketPath; - llvm::sys::fs::createUniquePath("timout_provided.sock", SocketPath, true); + llvm::sys::fs::createUniquePath("read_with_timeout.sock", SocketPath, true); + + // Make sure socket file does not exist. 
May still be there from the last test + std::remove(SocketPath.c_str()); + + Expected MaybeServerListener = + ListeningSocket::createUnix(SocketPath); + ASSERT_THAT_EXPECTED(MaybeServerListener, llvm::Succeeded()); + ListeningSocket ServerListener = std::move(*MaybeServerListener); + + Expected> MaybeClient = + raw_socket_stream::createConnectedUnix(SocketPath); + ASSERT_THAT_EXPECTED(MaybeClient, llvm::Succeeded()); + + Expected> MaybeServer = + ServerListener.accept(); + ASSERT_THAT_EXPECTED(MaybeServer, llvm::Succeeded()); + raw_socket_stream &Server = **MaybeServer; + + char Bytes[8]; + ssize_t BytesRead = Server.read(Bytes, 8, std::chrono::milliseconds(100)); + ASSERT_EQ(BytesRead, -1); + ASSERT_EQ(Server.has_error(), true); + ASSERT_EQ(Server.error(), std::errc::timed_out); + Server.clear_error(); +} + +TEST(raw_socket_streamTest, ACCEPT_WITH_TIMEOUT) { + if (!hasUnixSocketSupport()) + GTEST_SKIP(); + + SmallString<100> SocketPath; + llvm::sys::fs::createUniquePath("accept_with_timeout.sock", SocketPath, true); // Make sure socket file does not exist. May still be there from the last test std::remove(SocketPath.c_str()); @@ -82,19 +115,19 @@ TEST(raw_socket_streamTest, TIMEOUT_PROVIDED) { ASSERT_THAT_EXPECTED(MaybeServerListener, llvm::Succeeded()); ListeningSocket ServerListener = std::move(*MaybeServerListener); - std::chrono::milliseconds Timeout = std::chrono::milliseconds(100); Expected> MaybeServer = - ServerListener.accept(Timeout); + ServerListener.accept(std::chrono::milliseconds(100)); ASSERT_EQ(llvm::errorToErrorCode(MaybeServer.takeError()), std::errc::timed_out); } -TEST(raw_socket_streamTest, FILE_DESCRIPTOR_CLOSED) { +TEST(raw_socket_streamTest, ACCEPT_WITH_SHUTDOWN) { if (!hasUnixSocketSupport()) GTEST_SKIP(); SmallString<100> SocketPath; - llvm::sys::fs::createUniquePath("fd_closed.sock", SocketPath, true); + llvm::sys::fs::createUniquePath("accept_with_shutdown.sock", SocketPath, + true); // Make sure socket file does not exist. 
May still be there from the last test std::remove(SocketPath.c_str()); diff --git a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp index 04e9e0fa48db0..910488a14b985 100644 --- a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp +++ b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/DenseSet.h" #include "llvm/Support/RISCVISAUtils.h" #include "llvm/TableGen/Record.h" #include "llvm/TableGen/TableGenBackend.h" @@ -209,10 +210,46 @@ static void emitRISCVProcs(RecordKeeper &RK, raw_ostream &OS) { OS << "\n#undef TUNE_PROC\n"; } +static void emitRISCVExtensionBitmask(RecordKeeper &RK, raw_ostream &OS) { + + std::vector Extensions = + RK.getAllDerivedDefinitionsIfDefined("RISCVExtensionBitmask"); + llvm::sort(Extensions, [](const Record *Rec1, const Record *Rec2) { + return getExtensionName(Rec1) < getExtensionName(Rec2); + }); + +#ifndef NDEBUG + llvm::DenseSet> Seen; +#endif + + OS << "#ifdef GET_RISCVExtensionBitmaskTable_IMPL\n"; + OS << "static const RISCVExtensionBitmask ExtensionBitmask[]={\n"; + for (const Record *Rec : Extensions) { + unsigned GroupIDVal = Rec->getValueAsInt("GroupID"); + unsigned BitPosVal = Rec->getValueAsInt("BitPos"); + + StringRef ExtName = Rec->getValueAsString("Name"); + ExtName.consume_front("experimental-"); + +#ifndef NDEBUG + assert(Seen.insert(std::make_pair(GroupIDVal, BitPosVal)).second && + "duplicated bitmask"); +#endif + + OS << " {" + << "\"" << ExtName << "\"" + << ", " << GroupIDVal << ", " << BitPosVal << "ULL" + << "},\n"; + } + OS << "};\n"; + OS << "#endif\n"; +} + static void EmitRISCVTargetDef(RecordKeeper &RK, raw_ostream &OS) { emitRISCVExtensions(RK, OS); emitRISCVProfiles(RK, OS); emitRISCVProcs(RK, OS); + emitRISCVExtensionBitmask(RK, OS); } static TableGen::Emitter::Opt X("gen-riscv-target-def", EmitRISCVTargetDef, diff --git 
a/mlir/docs/Canonicalization.md b/mlir/docs/Canonicalization.md index d1cba572af212..03fd174229afe 100644 --- a/mlir/docs/Canonicalization.md +++ b/mlir/docs/Canonicalization.md @@ -33,6 +33,10 @@ together. Some important things to think about w.r.t. canonicalization patterns: +* The goal of canonicalization is to make subsequent analyses and + optimizations more effective. Therefore, performance improvements are not + necessary for canonicalization. + * Pass pipelines should not rely on the canonicalizer pass for correctness. They should work correctly with all instances of the canonicalization pass removed. @@ -51,6 +55,39 @@ Some important things to think about w.r.t. canonicalization patterns: * It is always good to eliminate operations entirely when possible, e.g. by folding known identities (like "x + 0 = x"). +* Pattens with expensive running time (i.e. have O(n) complexity) or + complicated cost models don't belong to canonicalization: since the + algorithm is executed iteratively until fixed-point we want patterns that + execute quickly (in particular their matching phase). + +* Canonicalize shouldn't lose the semantic of original operation: the original + information should always be recoverable from the transformed IR. + +For example, a pattern that transform + +``` + %transpose = linalg.transpose + ins(%input : tensor<1x2x3xf32>) + outs(%init1 : tensor<2x1x3xf32>) + dimensions = [1, 0, 2] + %out = linalg.transpose + ins(%tranpose: tensor<2x1x3xf32>) + outs(%init2 : tensor<3x1x2xf32>) + permutation = [2, 1, 0] +``` + +to + +``` + %out= linalg.transpose + ins(%input : tensor<1x2x3xf32>) + outs(%init2: tensor<3x1x2xf32>) + permutation = [2, 0, 1] +``` + +is a good canonicalization pattern because it removes a redundant operation, +making other analysis optimizations and more efficient. + ## Globally Applied Rules These transformations are applied to all levels of IR: @@ -189,7 +226,7 @@ each of the operands, returning the corresponding constant attribute. 
These operands are those that implement the `ConstantLike` trait. If any of the operands are non-constant, a null `Attribute` value is provided instead. For example, if MyOp provides three operands [`a`, `b`, `c`], but only `b` is -constant then `adaptor` will return Attribute() for `getA()` and `getC()`, +constant then `adaptor` will return Attribute() for `getA()` and `getC()`, and b-value for `getB()`. Also above, is the use of `OpFoldResult`. This class represents the possible diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 0b552a7e1ca3b..a045868b66031 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -293,10 +293,9 @@ class CreateBlockRewrite : public BlockRewrite { /// original location. class EraseBlockRewrite : public BlockRewrite { public: - EraseBlockRewrite(ConversionPatternRewriterImpl &rewriterImpl, Block *block, - Region *region, Block *insertBeforeBlock) - : BlockRewrite(Kind::EraseBlock, rewriterImpl, block), region(region), - insertBeforeBlock(insertBeforeBlock) {} + EraseBlockRewrite(ConversionPatternRewriterImpl &rewriterImpl, Block *block) + : BlockRewrite(Kind::EraseBlock, rewriterImpl, block), + region(block->getParent()), insertBeforeBlock(block->getNextNode()) {} static bool classof(const IRRewrite *rewrite) { return rewrite->getKind() == Kind::EraseBlock; @@ -1440,9 +1439,7 @@ void ConversionPatternRewriterImpl::notifyOpReplaced(Operation *op, } void ConversionPatternRewriterImpl::notifyBlockIsBeingErased(Block *block) { - Region *region = block->getParent(); - Block *origNextBlock = block->getNextNode(); - appendRewrite(block, region, origNextBlock); + appendRewrite(block); } void ConversionPatternRewriterImpl::notifyBlockInserted( From 0274f697376264c2d77816190f9a434f64e79089 Mon Sep 17 00:00:00 2001 From: shawbyoung Date: Mon, 22 Jul 2024 11:56:23 -0700 Subject: [PATCH 02/42] Changed assignment of 
profiles with pseudo probe index Created using spr 1.3.4 --- bolt/lib/Profile/StaleProfileMatching.cpp | 85 +++++++++++++++---- .../X86/match-blocks-with-pseudo-probes.test | 25 ++---- 2 files changed, 78 insertions(+), 32 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index 4105f626fb5b6..c135ee5ff4837 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -195,11 +195,15 @@ class StaleMatcher { void init(const std::vector &Blocks, const std::vector &Hashes, const std::vector &CallHashes, - std::optional YamlBFGUID) { + const std::unordered_map> + IndexToBinaryPseudoProbes, + const std::unordered_map + BinaryPseudoProbeToBlock, + const uint64_t YamlBFGUID) { assert(Blocks.size() == Hashes.size() && Hashes.size() == CallHashes.size() && "incorrect matcher initialization"); - for (size_t I = 0; I < Blocks.size(); I++) { FlowBlock *Block = Blocks[I]; uint16_t OpHash = Hashes[I].OpcodeHash; @@ -209,6 +213,8 @@ class StaleMatcher { std::make_pair(Hashes[I], Block)); this->Blocks.push_back(Block); } + this->IndexToBinaryPseudoProbes = IndexToBinaryPseudoProbes; + this->BinaryPseudoProbeToBlock = BinaryPseudoProbeToBlock; this->YamlBFGUID = YamlBFGUID; } @@ -234,10 +240,14 @@ class StaleMatcher { using HashBlockPairType = std::pair; std::unordered_map> OpHashToBlocks; std::unordered_map> CallHashToBlocks; - std::vector Blocks; + std::unordered_map> + IndexToBinaryPseudoProbes; + std::unordered_map + BinaryPseudoProbeToBlock; + std::vector Blocks; // If the pseudo probe checksums of the profiled and binary functions are // equal, then the YamlBF's GUID is defined and used to match blocks. - std::optional YamlBFGUID; + uint64_t YamlBFGUID; // Uses OpcodeHash to find the most similar block for a given hash. 
const FlowBlock *matchWithOpcodes(BlendedBlockHash BlendedHash) const { @@ -284,7 +294,7 @@ class StaleMatcher { // Searches for the pseudo probe attached to the matched function's block, // ignoring pseudo probes attached to function calls and inlined functions' // blocks. - outs() << "match with pseudo probes\n"; + std::vector BlockPseudoProbes; for (const auto &PseudoProbe : PseudoProbes) { // Ensures that pseudo probe information belongs to the appropriate // function and not an inlined function. @@ -293,11 +303,30 @@ class StaleMatcher { // Skips pseudo probes attached to function calls. if (PseudoProbe.Type != static_cast(PseudoProbeType::Block)) continue; - assert(PseudoProbe.Index < Blocks.size() && - "pseudo probe index out of range"); - return Blocks[PseudoProbe.Index]; + + BlockPseudoProbes.push_back(&PseudoProbe); } - return nullptr; + + // Returns nullptr if there is not a 1:1 mapping of the yaml block pseudo + // probe and binary pseudo probe. + if (BlockPseudoProbes.size() == 0 || BlockPseudoProbes.size() > 1) + return nullptr; + + uint64_t Index = BlockPseudoProbes[0]->Index; + assert(Index < Blocks.size() && "Invalid pseudo probe index"); + + auto It = IndexToBinaryPseudoProbes.find(Index); + assert(It != IndexToBinaryPseudoProbes.end() && + "All blocks should have a pseudo probe"); + if (It->second.size() > 1) + return nullptr; + + const MCDecodedPseudoProbe *BinaryPseudoProbe = It->second[0]; + auto BinaryPseudoProbeIt = BinaryPseudoProbeToBlock.find(BinaryPseudoProbe); + assert(BinaryPseudoProbeIt != BinaryPseudoProbeToBlock.end() && + "All binary pseudo probes should belong a binary basic block"); + + return BinaryPseudoProbeIt->second; } }; @@ -491,6 +520,11 @@ size_t matchWeightsByHashes( std::vector CallHashes; std::vector Blocks; std::vector BlendedHashes; + std::unordered_map> + IndexToBinaryPseudoProbes; + std::unordered_map + BinaryPseudoProbeToBlock; + const MCPseudoProbeDecoder *PseudoProbeDecoder = BC.getPseudoProbeDecoder(); for 
(uint64_t I = 0; I < BlockOrder.size(); I++) { const BinaryBasicBlock *BB = BlockOrder[I]; assert(BB->getHash() != 0 && "empty hash of BinaryBasicBlock"); @@ -510,9 +544,27 @@ size_t matchWeightsByHashes( Blocks.push_back(&Func.Blocks[I + 1]); BlendedBlockHash BlendedHash(BB->getHash()); BlendedHashes.push_back(BlendedHash); + if (PseudoProbeDecoder) { + const AddressProbesMap &ProbeMap = + PseudoProbeDecoder->getAddress2ProbesMap(); + const uint64_t FuncAddr = BF.getAddress(); + const std::pair &BlockRange = + BB->getInputAddressRange(); + const auto &BlockProbes = + llvm::make_range(ProbeMap.lower_bound(FuncAddr + BlockRange.first), + ProbeMap.lower_bound(FuncAddr + BlockRange.second)); + for (const auto &[_, Probes] : BlockProbes) { + for (const MCDecodedPseudoProbe &Probe : Probes) { + IndexToBinaryPseudoProbes[Probe.getIndex()].push_back(&Probe); + BinaryPseudoProbeToBlock[&Probe] = Blocks[I]; + } + } + } + LLVM_DEBUG(dbgs() << "BB with index " << I << " has hash = " << Twine::utohexstr(BB->getHash()) << "\n"); } + uint64_t BFPseudoProbeDescHash = 0; if (BF.hasPseudoProbe()) { const MCPseudoProbeDecoder *PseudoProbeDecoder = BC.getPseudoProbeDecoder(); @@ -521,14 +573,15 @@ size_t matchWeightsByHashes( BFPseudoProbeDescHash = PseudoProbeDecoder->getFuncDescForGUID(BF.getGUID())->FuncHash; } - bool MatchWithPseudoProbes = - BFPseudoProbeDescHash && YamlBF.PseudoProbeDescHash - ? BFPseudoProbeDescHash == YamlBF.PseudoProbeDescHash - : false; + uint64_t YamlBFGUID = + BFPseudoProbeDescHash && YamlBF.PseudoProbeDescHash && + BFPseudoProbeDescHash == YamlBF.PseudoProbeDescHash + ? static_cast(YamlBF.GUID) + : 0; + StaleMatcher Matcher; - Matcher.init(Blocks, BlendedHashes, CallHashes, - MatchWithPseudoProbes ? 
std::make_optional(YamlBF.GUID) - : std::nullopt); + Matcher.init(Blocks, BlendedHashes, CallHashes, IndexToBinaryPseudoProbes, + BinaryPseudoProbeToBlock, YamlBFGUID); // Index in yaml profile => corresponding (matched) block DenseMap MatchedBlocks; diff --git a/bolt/test/X86/match-blocks-with-pseudo-probes.test b/bolt/test/X86/match-blocks-with-pseudo-probes.test index e0adb6948e206..1d74b92a11c56 100644 --- a/bolt/test/X86/match-blocks-with-pseudo-probes.test +++ b/bolt/test/X86/match-blocks-with-pseudo-probes.test @@ -5,7 +5,7 @@ # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/main.s -o %t.o # RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib # RUN: llvm-bolt %t.exe -o %t.out --data %t/yaml -v=2 \ -# RUN: --print-cfg --funcs=main --profile-ignore-hash=0 2>&1 | FileCheck %s +# RUN: --print-cfg --funcs=main --profile-ignore-hash=0 --infer-stale-profile 2>&1 | FileCheck %s # CHECK: BOLT-INFO: matched 0 functions with similar names @@ -47,23 +47,16 @@ header: dfs-order: false hash-func: xxh3 functions: - - name: main - fid: 0 - hash: 0x0000000000000001 - exec: 1 - nblocks: 6 + - name: main + fid: 0 + hash: 0x0000000000000001 + exec: 1 + nblocks: 6 + guid: 0xDB956436E78DD5FA + pseudo_probe_desc_hash: 15822663052811949562 #lookup in code in a second blocks: - bid: 1 hash: 0x0000000000000001 insns: 1 succ: [ { bid: 3, cnt: 1} ] - - name: foo - fid: 1 - hash: 0x0000000000000002 - exec: 1 - nblocks: 6 - blocks: - - bid: 1 - hash: 0x0000000000000002 - insns: 1 - succ: [ { bid: 3, cnt: 1} ] + pseudo_probes: [ { guid: 0xDB956436E78DD5FA, id: 0, type: 0 } ] From 7e3d8d6b171954836c858f0814befc54f70bd3aa Mon Sep 17 00:00:00 2001 From: shawbyoung Date: Mon, 22 Jul 2024 14:27:44 -0700 Subject: [PATCH 03/42] Edit test and assert Created using spr 1.3.4 --- bolt/lib/Profile/StaleProfileMatching.cpp | 2 +- bolt/test/X86/match-blocks-with-pseudo-probes.test | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git 
a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index c135ee5ff4837..71e0579415fc6 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -313,7 +313,7 @@ class StaleMatcher { return nullptr; uint64_t Index = BlockPseudoProbes[0]->Index; - assert(Index < Blocks.size() && "Invalid pseudo probe index"); + assert(Index <= Blocks.size() && "Invalid pseudo probe index"); auto It = IndexToBinaryPseudoProbes.find(Index); assert(It != IndexToBinaryPseudoProbes.end() && diff --git a/bolt/test/X86/match-blocks-with-pseudo-probes.test b/bolt/test/X86/match-blocks-with-pseudo-probes.test index 1d74b92a11c56..6dc01eb492eae 100644 --- a/bolt/test/X86/match-blocks-with-pseudo-probes.test +++ b/bolt/test/X86/match-blocks-with-pseudo-probes.test @@ -53,10 +53,10 @@ functions: exec: 1 nblocks: 6 guid: 0xDB956436E78DD5FA - pseudo_probe_desc_hash: 15822663052811949562 #lookup in code in a second + pseudo_probe_desc_hash: 4294967295 #lookup in code in a second blocks: - bid: 1 hash: 0x0000000000000001 insns: 1 succ: [ { bid: 3, cnt: 1} ] - pseudo_probes: [ { guid: 0xDB956436E78DD5FA, id: 0, type: 0 } ] + pseudo_probes: [ { guid: 0xDB956436E78DD5FA, id: 1, type: 0 } ] From 780a07ee5a4b2bc3f5bd6e33fb072d67d1113c89 Mon Sep 17 00:00:00 2001 From: shawbyoung Date: Tue, 23 Jul 2024 11:37:14 -0700 Subject: [PATCH 04/42] Fixed failing asserts, pruned prospective pseudo probes for matching Created using spr 1.3.4 --- bolt/lib/Profile/StaleProfileMatching.cpp | 56 ++++++++++++++++------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index 71e0579415fc6..d45066ed66ef2 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -45,6 +45,7 @@ namespace opts { extern cl::opt TimeRewrite; extern cl::OptionCategory BoltOptCategory; +extern cl::opt Verbosity; 
cl::opt InferStaleProfile("infer-stale-profile", @@ -197,9 +198,9 @@ class StaleMatcher { const std::vector &CallHashes, const std::unordered_map> - IndexToBinaryPseudoProbes, + &IndexToBinaryPseudoProbes, const std::unordered_map - BinaryPseudoProbeToBlock, + &BinaryPseudoProbeToBlock, const uint64_t YamlBFGUID) { assert(Blocks.size() == Hashes.size() && Hashes.size() == CallHashes.size() && @@ -294,6 +295,9 @@ class StaleMatcher { // Searches for the pseudo probe attached to the matched function's block, // ignoring pseudo probes attached to function calls and inlined functions' // blocks. + if (opts::Verbosity >= 2) + outs() << "BOLT-INFO: attempting to match block with pseudo probes\n"; + std::vector BlockPseudoProbes; for (const auto &PseudoProbe : PseudoProbes) { // Ensures that pseudo probe information belongs to the appropriate @@ -306,26 +310,41 @@ class StaleMatcher { BlockPseudoProbes.push_back(&PseudoProbe); } - // Returns nullptr if there is not a 1:1 mapping of the yaml block pseudo // probe and binary pseudo probe. 
- if (BlockPseudoProbes.size() == 0 || BlockPseudoProbes.size() > 1) + if (BlockPseudoProbes.size() == 0) { + if (opts::Verbosity >= 2) + errs() << "BOLT-WARNING: no pseudo probes in profile block\n"; return nullptr; - + } + if (BlockPseudoProbes.size() > 1) { + if (opts::Verbosity >= 2) + errs() << "BOLT-WARNING: more than 1 pseudo probes in profile block\n"; + return nullptr; + } uint64_t Index = BlockPseudoProbes[0]->Index; - assert(Index <= Blocks.size() && "Invalid pseudo probe index"); - + if (Index > Blocks.size()) { + if (opts::Verbosity >= 2) + errs() << "BOLT-WARNING: invalid index block pseudo probe index\n"; + return nullptr; + } auto It = IndexToBinaryPseudoProbes.find(Index); - assert(It != IndexToBinaryPseudoProbes.end() && - "All blocks should have a pseudo probe"); - if (It->second.size() > 1) + if (It == IndexToBinaryPseudoProbes.end()) { + if (opts::Verbosity >= 2) + errs() << "BOLT-WARNING: no block pseudo probes found within binary " + "block at index\n"; return nullptr; - + } + if (It->second.size() > 1) { + if (opts::Verbosity >= 2) + errs() << "BOLT-WARNING: more than 1 block pseudo probes in binary " + "block at index\n"; + return nullptr; + } const MCDecodedPseudoProbe *BinaryPseudoProbe = It->second[0]; auto BinaryPseudoProbeIt = BinaryPseudoProbeToBlock.find(BinaryPseudoProbe); assert(BinaryPseudoProbeIt != BinaryPseudoProbeToBlock.end() && "All binary pseudo probes should belong a binary basic block"); - return BinaryPseudoProbeIt->second; } }; @@ -555,6 +574,10 @@ size_t matchWeightsByHashes( ProbeMap.lower_bound(FuncAddr + BlockRange.second)); for (const auto &[_, Probes] : BlockProbes) { for (const MCDecodedPseudoProbe &Probe : Probes) { + if (Probe.getInlineTreeNode()->hasInlineSite()) + continue; + if (Probe.getType() != static_cast(PseudoProbeType::Block)) + continue; IndexToBinaryPseudoProbes[Probe.getIndex()].push_back(&Probe); BinaryPseudoProbeToBlock[&Probe] = Blocks[I]; } @@ -566,12 +589,13 @@ size_t matchWeightsByHashes( } 
uint64_t BFPseudoProbeDescHash = 0; - if (BF.hasPseudoProbe()) { - const MCPseudoProbeDecoder *PseudoProbeDecoder = BC.getPseudoProbeDecoder(); + if (BF.getGUID() != 0) { assert(PseudoProbeDecoder && "If BF has pseudo probe, BC should have a pseudo probe decoder"); - BFPseudoProbeDescHash = - PseudoProbeDecoder->getFuncDescForGUID(BF.getGUID())->FuncHash; + auto &GUID2FuncDescMap = PseudoProbeDecoder->getGUID2FuncDescMap(); + auto It = GUID2FuncDescMap.find(BF.getGUID()); + if (It != GUID2FuncDescMap.end()) + BFPseudoProbeDescHash = It->second.FuncHash; } uint64_t YamlBFGUID = BFPseudoProbeDescHash && YamlBF.PseudoProbeDescHash && From 1638ac1dacec63d9099ae3c19f2fee7c0797ed71 Mon Sep 17 00:00:00 2001 From: shawbyoung Date: Tue, 23 Jul 2024 14:24:02 -0700 Subject: [PATCH 05/42] Added logging for pseudo probe block matching Created using spr 1.3.4 --- bolt/include/bolt/Core/BinaryContext.h | 12 ++++++--- bolt/lib/Passes/BinaryPasses.cpp | 18 +++++++++++--- bolt/lib/Profile/StaleProfileMatching.cpp | 30 +++++++++++++++++------ 3 files changed, 44 insertions(+), 16 deletions(-) diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h index b3cf9f834cc08..39f2ac512d305 100644 --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -717,12 +717,16 @@ class BinaryContext { /// Stats for stale profile matching: /// the total number of basic blocks in the profile uint32_t NumStaleBlocks{0}; - /// the number of matched basic blocks - uint32_t NumMatchedBlocks{0}; + /// the number of exactly matched basic blocks + uint32_t NumExactMatchedBlocks{0}; + /// the number of pseudo probe matched basic blocks + uint32_t NumPseudoProbeMatchedBlocks{0}; /// the total count of samples in the profile uint64_t StaleSampleCount{0}; - /// the count of matched samples - uint64_t MatchedSampleCount{0}; + /// the count of exactly matched samples + uint64_t ExactMatchedSampleCount{0}; + /// the count of pseudo probe 
matched samples + uint64_t PseudoProbeMatchedSampleCount{0}; /// the number of stale functions that have matching number of blocks in /// the profile uint64_t NumStaleFuncsWithEqualBlockCount{0}; diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index fa95ad7324ac1..b786f07a6a665 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -1519,10 +1519,20 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) { "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks" " (%zu out of %zu stale) responsible for %.2f%% samples" " (%zu out of %zu stale)\n", - 100.0 * BC.Stats.NumMatchedBlocks / BC.Stats.NumStaleBlocks, - BC.Stats.NumMatchedBlocks, BC.Stats.NumStaleBlocks, - 100.0 * BC.Stats.MatchedSampleCount / BC.Stats.StaleSampleCount, - BC.Stats.MatchedSampleCount, BC.Stats.StaleSampleCount); + 100.0 * BC.Stats.NumExactMatchedBlocks / BC.Stats.NumStaleBlocks, + BC.Stats.NumExactMatchedBlocks, BC.Stats.NumStaleBlocks, + 100.0 * BC.Stats.ExactMatchedSampleCount / BC.Stats.StaleSampleCount, + BC.Stats.ExactMatchedSampleCount, BC.Stats.StaleSampleCount); + BC.outs() << format( + "BOLT-INFO: inference found a pseudo probe match for %.2f%% of basic " + "blocks" + " (%zu out of %zu stale) responsible for %.2f%% samples" + " (%zu out of %zu stale)\n", + 100.0 * BC.Stats.NumPseudoProbeMatchedBlocks / BC.Stats.NumStaleBlocks, + BC.Stats.NumPseudoProbeMatchedBlocks, BC.Stats.NumStaleBlocks, + 100.0 * BC.Stats.PseudoProbeMatchedSampleCount / + BC.Stats.StaleSampleCount, + BC.Stats.PseudoProbeMatchedSampleCount, BC.Stats.StaleSampleCount); } if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) { diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index d45066ed66ef2..919f3a732b355 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -220,13 +220,14 @@ class StaleMatcher { } /// Find 
the most similar block for a given hash. - const FlowBlock *matchBlock( - BlendedBlockHash BlendedHash, uint64_t CallHash, - const std::vector &PseudoProbes) const { + const FlowBlock * + matchBlock(BlendedBlockHash BlendedHash, uint64_t CallHash, + const std::vector &PseudoProbes) { const FlowBlock *BestBlock = matchWithOpcodes(BlendedHash); BestBlock = BestBlock ? BestBlock : matchWithCalls(BlendedHash, CallHash); - return BestBlock || !YamlBFGUID ? BestBlock - : matchWithPseudoProbes(PseudoProbes); + return BestBlock || !YamlBFGUID + ? BestBlock + : matchWithPseudoProbes(BlendedHash, PseudoProbes); } /// Returns true if the two basic blocks (in the binary and in the profile) @@ -237,6 +238,11 @@ class StaleMatcher { return Hash1.InstrHash == Hash2.InstrHash; } + bool isPseudoProbeMatch(BlendedBlockHash YamlBBHash) { + return MatchedWithPseudoProbes.find(YamlBBHash.combine()) != + MatchedWithPseudoProbes.end(); + } + private: using HashBlockPairType = std::pair; std::unordered_map> OpHashToBlocks; @@ -245,6 +251,7 @@ class StaleMatcher { IndexToBinaryPseudoProbes; std::unordered_map BinaryPseudoProbeToBlock; + std::unordered_set MatchedWithPseudoProbes; std::vector Blocks; // If the pseudo probe checksums of the profiled and binary functions are // equal, then the YamlBF's GUID is defined and used to match blocks. @@ -291,7 +298,8 @@ class StaleMatcher { // Uses pseudo probe information to attach the profile to the appropriate // block. const FlowBlock *matchWithPseudoProbes( - const std::vector &PseudoProbes) const { + BlendedBlockHash BlendedHash, + const std::vector &PseudoProbes) { // Searches for the pseudo probe attached to the matched function's block, // ignoring pseudo probes attached to function calls and inlined functions' // blocks. 
@@ -345,6 +353,8 @@ class StaleMatcher { auto BinaryPseudoProbeIt = BinaryPseudoProbeToBlock.find(BinaryPseudoProbe); assert(BinaryPseudoProbeIt != BinaryPseudoProbeToBlock.end() && "All binary pseudo probes should belong a binary basic block"); + + MatchedWithPseudoProbes.insert(BlendedHash.combine()); return BinaryPseudoProbeIt->second; } }; @@ -639,9 +649,13 @@ size_t matchWeightsByHashes( << "\n"); // Update matching stats accounting for the matched block. if (Matcher.isHighConfidenceMatch(BinHash, YamlHash)) { - ++BC.Stats.NumMatchedBlocks; - BC.Stats.MatchedSampleCount += YamlBB.ExecCount; + ++BC.Stats.NumExactMatchedBlocks; + BC.Stats.ExactMatchedSampleCount += YamlBB.ExecCount; LLVM_DEBUG(dbgs() << " exact match\n"); + } else if (Matcher.isPseudoProbeMatch(YamlHash)) { + ++BC.Stats.NumPseudoProbeMatchedBlocks; + BC.Stats.PseudoProbeMatchedSampleCount += YamlBB.ExecCount; + LLVM_DEBUG(dbgs() << " pseudo probe match\n"); } else { LLVM_DEBUG(dbgs() << " loose match\n"); } From 144716be84d2207ee98fb238b88c6495942dec21 Mon Sep 17 00:00:00 2001 From: shawbyoung Date: Tue, 23 Jul 2024 15:41:31 -0700 Subject: [PATCH 06/42] Changed pseudo probe matching failure logging to v=3 Created using spr 1.3.4 --- bolt/lib/Profile/StaleProfileMatching.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index 919f3a732b355..2d1a73bd60e8f 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -255,7 +255,7 @@ class StaleMatcher { std::vector Blocks; // If the pseudo probe checksums of the profiled and binary functions are // equal, then the YamlBF's GUID is defined and used to match blocks. - uint64_t YamlBFGUID; + uint64_t YamlBFGUID{0}; // Uses OpcodeHash to find the most similar block for a given hash. 
const FlowBlock *matchWithOpcodes(BlendedBlockHash BlendedHash) const { @@ -321,30 +321,30 @@ class StaleMatcher { // Returns nullptr if there is not a 1:1 mapping of the yaml block pseudo // probe and binary pseudo probe. if (BlockPseudoProbes.size() == 0) { - if (opts::Verbosity >= 2) + if (opts::Verbosity >= 3) errs() << "BOLT-WARNING: no pseudo probes in profile block\n"; return nullptr; } if (BlockPseudoProbes.size() > 1) { - if (opts::Verbosity >= 2) + if (opts::Verbosity >= 3) errs() << "BOLT-WARNING: more than 1 pseudo probes in profile block\n"; return nullptr; } uint64_t Index = BlockPseudoProbes[0]->Index; if (Index > Blocks.size()) { - if (opts::Verbosity >= 2) + if (opts::Verbosity >= 3) errs() << "BOLT-WARNING: invalid index block pseudo probe index\n"; return nullptr; } auto It = IndexToBinaryPseudoProbes.find(Index); if (It == IndexToBinaryPseudoProbes.end()) { - if (opts::Verbosity >= 2) + if (opts::Verbosity >= 3) errs() << "BOLT-WARNING: no block pseudo probes found within binary " "block at index\n"; return nullptr; } if (It->second.size() > 1) { - if (opts::Verbosity >= 2) + if (opts::Verbosity >= 3) errs() << "BOLT-WARNING: more than 1 block pseudo probes in binary " "block at index\n"; return nullptr; From 29347109ada65c82fef3aa0803b18c413d9c4e6b Mon Sep 17 00:00:00 2001 From: shawbyoung Date: Tue, 23 Jul 2024 15:48:14 -0700 Subject: [PATCH 07/42] More loggin Created using spr 1.3.4 --- bolt/lib/Profile/StaleProfileMatching.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index 2d1a73bd60e8f..3762d91ea9489 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -238,11 +238,17 @@ class StaleMatcher { return Hash1.InstrHash == Hash2.InstrHash; } + /// Returns true if a profiled block was matched with its pseudo probe. 
bool isPseudoProbeMatch(BlendedBlockHash YamlBBHash) { return MatchedWithPseudoProbes.find(YamlBBHash.combine()) != MatchedWithPseudoProbes.end(); } + /// Returns the number of blocks matched with pseudo probes. + size_t getNumBlocksMatchedWithPseudoProbes() const { + return MatchedWithPseudoProbes.size(); + } + private: using HashBlockPairType = std::pair; std::unordered_map> OpHashToBlocks; @@ -303,7 +309,7 @@ class StaleMatcher { // Searches for the pseudo probe attached to the matched function's block, // ignoring pseudo probes attached to function calls and inlined functions' // blocks. - if (opts::Verbosity >= 2) + if (opts::Verbosity >= 3) outs() << "BOLT-INFO: attempting to match block with pseudo probes\n"; std::vector BlockPseudoProbes; @@ -672,6 +678,11 @@ size_t matchWeightsByHashes( BC.Stats.StaleSampleCount += YamlBB.ExecCount; } + if (opts::Verbosity >= 2) + outs() << "BOLT-INFO: " + << StaleMatcher.getNumBlocksMatchedWithPseudoProbes() + << " blocks matched with pseudo probes\n"; + // Match jumps from the profile to the jumps from CFG std::vector OutWeight(Func.Blocks.size(), 0); std::vector InWeight(Func.Blocks.size(), 0); From b74fc8b2f200b776dcf0e51d505e4e43267ef938 Mon Sep 17 00:00:00 2001 From: shawbyoung Date: Tue, 23 Jul 2024 16:03:21 -0700 Subject: [PATCH 08/42] Logging blocks matched with opcodes Created using spr 1.3.4 --- bolt/lib/Profile/StaleProfileMatching.cpp | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index 3762d91ea9489..b31bddd47edf9 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -224,6 +224,8 @@ class StaleMatcher { matchBlock(BlendedBlockHash BlendedHash, uint64_t CallHash, const std::vector &PseudoProbes) { const FlowBlock *BestBlock = matchWithOpcodes(BlendedHash); + if (BestBlock) + ++MatchedWithOpcodes; BestBlock = BestBlock ? 
BestBlock : matchWithCalls(BlendedHash, CallHash); return BestBlock || !YamlBFGUID ? BestBlock @@ -247,7 +249,10 @@ class StaleMatcher { /// Returns the number of blocks matched with pseudo probes. size_t getNumBlocksMatchedWithPseudoProbes() const { return MatchedWithPseudoProbes.size(); - } + } + + /// Returns the number of blocks matched with opcodes. + size_t getNumBlocksMatchedWithOpcodes() const { return MatchedWithOpcodes; } private: using HashBlockPairType = std::pair; @@ -259,9 +264,8 @@ class StaleMatcher { BinaryPseudoProbeToBlock; std::unordered_set MatchedWithPseudoProbes; std::vector Blocks; - // If the pseudo probe checksums of the profiled and binary functions are - // equal, then the YamlBF's GUID is defined and used to match blocks. uint64_t YamlBFGUID{0}; + uint64_t MatchedWithOpcodes{0}; // Uses OpcodeHash to find the most similar block for a given hash. const FlowBlock *matchWithOpcodes(BlendedBlockHash BlendedHash) const { @@ -678,10 +682,13 @@ size_t matchWeightsByHashes( BC.Stats.StaleSampleCount += YamlBB.ExecCount; } - if (opts::Verbosity >= 2) - outs() << "BOLT-INFO: " - << StaleMatcher.getNumBlocksMatchedWithPseudoProbes() - << " blocks matched with pseudo probes\n"; + if (opts::Verbosity >= 2) { + outs() << "BOLT-INFO: " + << StaleMatcher.getNumBlocksMatchedWithPseudoProbes() + << " blocks matched with pseudo probes\n" + << "BOLT-INFO: " << StaleMatcher.getNumBlocksMatchedWithOpcodes() + << " blocks matched with opcodes\n"; + } // Match jumps from the profile to the jumps from CFG std::vector OutWeight(Func.Blocks.size(), 0); From c38fb98fb287d881ce8162fde0522d60b43da56f Mon Sep 17 00:00:00 2001 From: shawbyoung Date: Tue, 23 Jul 2024 16:10:09 -0700 Subject: [PATCH 09/42] Updated test Created using spr 1.3.4 --- bolt/lib/Profile/StaleProfileMatching.cpp | 4 ++-- bolt/test/X86/match-blocks-with-pseudo-probes.test | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp 
b/bolt/lib/Profile/StaleProfileMatching.cpp index b31bddd47edf9..c621c29a0db83 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -684,9 +684,9 @@ size_t matchWeightsByHashes( if (opts::Verbosity >= 2) { outs() << "BOLT-INFO: " - << StaleMatcher.getNumBlocksMatchedWithPseudoProbes() + << Matcher.getNumBlocksMatchedWithPseudoProbes() << " blocks matched with pseudo probes\n" - << "BOLT-INFO: " << StaleMatcher.getNumBlocksMatchedWithOpcodes() + << "BOLT-INFO: " << Matcher.getNumBlocksMatchedWithOpcodes() << " blocks matched with opcodes\n"; } diff --git a/bolt/test/X86/match-blocks-with-pseudo-probes.test b/bolt/test/X86/match-blocks-with-pseudo-probes.test index 6dc01eb492eae..83f9c20f31ba6 100644 --- a/bolt/test/X86/match-blocks-with-pseudo-probes.test +++ b/bolt/test/X86/match-blocks-with-pseudo-probes.test @@ -7,7 +7,7 @@ # RUN: llvm-bolt %t.exe -o %t.out --data %t/yaml -v=2 \ # RUN: --print-cfg --funcs=main --profile-ignore-hash=0 --infer-stale-profile 2>&1 | FileCheck %s -# CHECK: BOLT-INFO: matched 0 functions with similar names +# CHECK: BOLT-INFO: inference found a pseudo probe match for 100.00% of basic blocks (1 out of 1 stale) responsible for -nan% samples (0 out of 0 stale) #--- main.s .text From b2a3ca7fd532828ae7320da6f888f20a1717bb92 Mon Sep 17 00:00:00 2001 From: shawbyoung Date: Tue, 23 Jul 2024 16:14:34 -0700 Subject: [PATCH 10/42] Name changes in prep for inlined block pseudo probe block matching Created using spr 1.3.4 --- bolt/lib/Profile/StaleProfileMatching.cpp | 35 +++++++++++------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index c621c29a0db83..4410fddaf0b21 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -198,9 +198,9 @@ class StaleMatcher { const std::vector &CallHashes, const std::unordered_map> - 
&IndexToBinaryPseudoProbes, + &IndexToBBPseudoProbes, const std::unordered_map - &BinaryPseudoProbeToBlock, + &BBPseudoProbeToBlock, const uint64_t YamlBFGUID) { assert(Blocks.size() == Hashes.size() && Hashes.size() == CallHashes.size() && @@ -214,8 +214,8 @@ class StaleMatcher { std::make_pair(Hashes[I], Block)); this->Blocks.push_back(Block); } - this->IndexToBinaryPseudoProbes = IndexToBinaryPseudoProbes; - this->BinaryPseudoProbeToBlock = BinaryPseudoProbeToBlock; + this->IndexToBBPseudoProbes = IndexToBBPseudoProbes; + this->BBPseudoProbeToBlock = BBPseudoProbeToBlock; this->YamlBFGUID = YamlBFGUID; } @@ -259,9 +259,9 @@ class StaleMatcher { std::unordered_map> OpHashToBlocks; std::unordered_map> CallHashToBlocks; std::unordered_map> - IndexToBinaryPseudoProbes; + IndexToBBPseudoProbes; std::unordered_map - BinaryPseudoProbeToBlock; + BBPseudoProbeToBlock; std::unordered_set MatchedWithPseudoProbes; std::vector Blocks; uint64_t YamlBFGUID{0}; @@ -346,8 +346,8 @@ class StaleMatcher { errs() << "BOLT-WARNING: invalid index block pseudo probe index\n"; return nullptr; } - auto It = IndexToBinaryPseudoProbes.find(Index); - if (It == IndexToBinaryPseudoProbes.end()) { + auto It = IndexToBBPseudoProbes.find(Index); + if (It == IndexToBBPseudoProbes.end()) { if (opts::Verbosity >= 3) errs() << "BOLT-WARNING: no block pseudo probes found within binary " "block at index\n"; @@ -360,8 +360,8 @@ class StaleMatcher { return nullptr; } const MCDecodedPseudoProbe *BinaryPseudoProbe = It->second[0]; - auto BinaryPseudoProbeIt = BinaryPseudoProbeToBlock.find(BinaryPseudoProbe); - assert(BinaryPseudoProbeIt != BinaryPseudoProbeToBlock.end() && + auto BinaryPseudoProbeIt = BBPseudoProbeToBlock.find(BinaryPseudoProbe); + assert(BinaryPseudoProbeIt != BBPseudoProbeToBlock.end() && "All binary pseudo probes should belong a binary basic block"); MatchedWithPseudoProbes.insert(BlendedHash.combine()); @@ -560,9 +560,9 @@ size_t matchWeightsByHashes( std::vector Blocks; std::vector 
BlendedHashes; std::unordered_map> - IndexToBinaryPseudoProbes; + IndexToBBPseudoProbes; std::unordered_map - BinaryPseudoProbeToBlock; + BBPseudoProbeToBlock; const MCPseudoProbeDecoder *PseudoProbeDecoder = BC.getPseudoProbeDecoder(); for (uint64_t I = 0; I < BlockOrder.size(); I++) { const BinaryBasicBlock *BB = BlockOrder[I]; @@ -598,8 +598,8 @@ size_t matchWeightsByHashes( continue; if (Probe.getType() != static_cast(PseudoProbeType::Block)) continue; - IndexToBinaryPseudoProbes[Probe.getIndex()].push_back(&Probe); - BinaryPseudoProbeToBlock[&Probe] = Blocks[I]; + IndexToBBPseudoProbes[Probe.getIndex()].push_back(&Probe); + BBPseudoProbeToBlock[&Probe] = Blocks[I]; } } } @@ -624,8 +624,8 @@ size_t matchWeightsByHashes( : 0; StaleMatcher Matcher; - Matcher.init(Blocks, BlendedHashes, CallHashes, IndexToBinaryPseudoProbes, - BinaryPseudoProbeToBlock, YamlBFGUID); + Matcher.init(Blocks, BlendedHashes, CallHashes, IndexToBBPseudoProbes, + BBPseudoProbeToBlock, YamlBFGUID); // Index in yaml profile => corresponding (matched) block DenseMap MatchedBlocks; @@ -683,8 +683,7 @@ size_t matchWeightsByHashes( } if (opts::Verbosity >= 2) { - outs() << "BOLT-INFO: " - << Matcher.getNumBlocksMatchedWithPseudoProbes() + outs() << "BOLT-INFO: " << Matcher.getNumBlocksMatchedWithPseudoProbes() << " blocks matched with pseudo probes\n" << "BOLT-INFO: " << Matcher.getNumBlocksMatchedWithOpcodes() << " blocks matched with opcodes\n"; From 2eb7bf2cff7c974a3327879fd46df7348fdb43e3 Mon Sep 17 00:00:00 2001 From: shawbyoung Date: Tue, 23 Jul 2024 17:16:19 -0700 Subject: [PATCH 11/42] Rm unnecessary Blocks vec in StaleMatcher Created using spr 1.3.4 --- bolt/lib/Profile/StaleProfileMatching.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index 4410fddaf0b21..6ee14ef0194bd 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -212,7 +212,6 
@@ class StaleMatcher { if (CallHashes[I]) CallHashToBlocks[CallHashes[I]].push_back( std::make_pair(Hashes[I], Block)); - this->Blocks.push_back(Block); } this->IndexToBBPseudoProbes = IndexToBBPseudoProbes; this->BBPseudoProbeToBlock = BBPseudoProbeToBlock; @@ -263,7 +262,6 @@ class StaleMatcher { std::unordered_map BBPseudoProbeToBlock; std::unordered_set MatchedWithPseudoProbes; - std::vector Blocks; uint64_t YamlBFGUID{0}; uint64_t MatchedWithOpcodes{0}; @@ -341,11 +339,6 @@ class StaleMatcher { return nullptr; } uint64_t Index = BlockPseudoProbes[0]->Index; - if (Index > Blocks.size()) { - if (opts::Verbosity >= 3) - errs() << "BOLT-WARNING: invalid index block pseudo probe index\n"; - return nullptr; - } auto It = IndexToBBPseudoProbes.find(Index); if (It == IndexToBBPseudoProbes.end()) { if (opts::Verbosity >= 3) From 212bd005b53b85596ffe84012546247db99e898f Mon Sep 17 00:00:00 2001 From: shawbyoung Date: Tue, 23 Jul 2024 17:28:01 -0700 Subject: [PATCH 12/42] Improved matched block counting Created using spr 1.3.4 --- bolt/lib/Profile/StaleProfileMatching.cpp | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index 6ee14ef0194bd..06557b3c3a388 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -223,12 +223,19 @@ class StaleMatcher { matchBlock(BlendedBlockHash BlendedHash, uint64_t CallHash, const std::vector &PseudoProbes) { const FlowBlock *BestBlock = matchWithOpcodes(BlendedHash); - if (BestBlock) + if (BestBlock) { ++MatchedWithOpcodes; - BestBlock = BestBlock ? BestBlock : matchWithCalls(BlendedHash, CallHash); - return BestBlock || !YamlBFGUID - ? 
BestBlock - : matchWithPseudoProbes(BlendedHash, PseudoProbes); + return BestBlock; + } + BestBlock = matchWithCalls(BlendedHash, CallHash); + if (BestBlock) { + return BestBlock; + } + BestBlock = matchWithPseudoProbes(BlendedHash, PseudoProbes); + if (BestBlock) { + MatchedWithPseudoProbes.insert(BlendedHash.combine()); + } + return BestBlock; } /// Returns true if the two basic blocks (in the binary and in the profile) @@ -307,7 +314,7 @@ class StaleMatcher { // block. const FlowBlock *matchWithPseudoProbes( BlendedBlockHash BlendedHash, - const std::vector &PseudoProbes) { + const std::vector &PseudoProbes) const { // Searches for the pseudo probe attached to the matched function's block, // ignoring pseudo probes attached to function calls and inlined functions' // blocks. @@ -357,7 +364,6 @@ class StaleMatcher { assert(BinaryPseudoProbeIt != BBPseudoProbeToBlock.end() && "All binary pseudo probes should belong a binary basic block"); - MatchedWithPseudoProbes.insert(BlendedHash.combine()); return BinaryPseudoProbeIt->second; } }; From eb6dfb973126a245aea21fae5369e06d628ddcdd Mon Sep 17 00:00:00 2001 From: shawbyoung Date: Tue, 23 Jul 2024 17:33:55 -0700 Subject: [PATCH 13/42] Removed comment from test Created using spr 1.3.4 --- bolt/lib/Profile/StaleProfileMatching.cpp | 6 +++--- bolt/test/X86/match-blocks-with-pseudo-probes.test | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index 06557b3c3a388..f3e19e8fb100d 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -252,14 +252,14 @@ class StaleMatcher { MatchedWithPseudoProbes.end(); } + /// Returns the number of blocks matched with opcodes. + size_t getNumBlocksMatchedWithOpcodes() const { return MatchedWithOpcodes; } + /// Returns the number of blocks matched with pseudo probes. 
size_t getNumBlocksMatchedWithPseudoProbes() const { return MatchedWithPseudoProbes.size(); } - /// Returns the number of blocks matched with opcodes. - size_t getNumBlocksMatchedWithOpcodes() const { return MatchedWithOpcodes; } - private: using HashBlockPairType = std::pair; std::unordered_map> OpHashToBlocks; diff --git a/bolt/test/X86/match-blocks-with-pseudo-probes.test b/bolt/test/X86/match-blocks-with-pseudo-probes.test index 83f9c20f31ba6..4a6f2f1cf129a 100644 --- a/bolt/test/X86/match-blocks-with-pseudo-probes.test +++ b/bolt/test/X86/match-blocks-with-pseudo-probes.test @@ -53,7 +53,7 @@ functions: exec: 1 nblocks: 6 guid: 0xDB956436E78DD5FA - pseudo_probe_desc_hash: 4294967295 #lookup in code in a second + pseudo_probe_desc_hash: 4294967295 blocks: - bid: 1 hash: 0x0000000000000001 From 16b5cfbbbb37820b00ad07b086481c46aefd0142 Mon Sep 17 00:00:00 2001 From: shawbyoung Date: Tue, 23 Jul 2024 17:48:12 -0700 Subject: [PATCH 14/42] Added comments and check for null YamlBFGUID in StaleMatcher before PseudoProbe matching Created using spr 1.3.4 --- bolt/lib/Profile/StaleProfileMatching.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index f3e19e8fb100d..b74c1c5071815 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -218,7 +218,8 @@ class StaleMatcher { this->YamlBFGUID = YamlBFGUID; } - /// Find the most similar block for a given hash. + /// Find the most similar flow block for a profile block given its hashes and + /// pseudo probe information. 
const FlowBlock * matchBlock(BlendedBlockHash BlendedHash, uint64_t CallHash, const std::vector &PseudoProbes) { @@ -315,6 +316,8 @@ class StaleMatcher { const FlowBlock *matchWithPseudoProbes( BlendedBlockHash BlendedHash, const std::vector &PseudoProbes) const { + if (!YamlBFGUID) + return nullptr; // Searches for the pseudo probe attached to the matched function's block, // ignoring pseudo probes attached to function calls and inlined functions' // blocks. @@ -582,6 +585,7 @@ size_t matchWeightsByHashes( Blocks.push_back(&Func.Blocks[I + 1]); BlendedBlockHash BlendedHash(BB->getHash()); BlendedHashes.push_back(BlendedHash); + // Collects pseudo probes attached to the BB for use in the StaleMatcher. if (PseudoProbeDecoder) { const AddressProbesMap &ProbeMap = PseudoProbeDecoder->getAddress2ProbesMap(); @@ -607,6 +611,10 @@ size_t matchWeightsByHashes( << Twine::utohexstr(BB->getHash()) << "\n"); } + // Sets the YamlBFGUID in the StaleMatcher such that if either the profiled or + // binary function dne or they are not equal, to zero, as not to perform + // pseudo probe block matching. Otherwise, the YamlBF's GUID is used for + // pseudo probe block matching. 
uint64_t BFPseudoProbeDescHash = 0; if (BF.getGUID() != 0) { assert(PseudoProbeDecoder && From 799f20cf7ed8dfc30d89beadd90d91758cdc9485 Mon Sep 17 00:00:00 2001 From: shawbyoung Date: Wed, 24 Jul 2024 07:42:26 -0700 Subject: [PATCH 15/42] Omitting braces in one line if Created using spr 1.3.4 --- bolt/lib/Profile/StaleProfileMatching.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index b74c1c5071815..e2fd85373485b 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -229,9 +229,8 @@ class StaleMatcher { return BestBlock; } BestBlock = matchWithCalls(BlendedHash, CallHash); - if (BestBlock) { + if (BestBlock) return BestBlock; - } BestBlock = matchWithPseudoProbes(BlendedHash, PseudoProbes); if (BestBlock) { MatchedWithPseudoProbes.insert(BlendedHash.combine()); From 33f1b2ad7809786449b328e3eca93bb9e46694f4 Mon Sep 17 00:00:00 2001 From: shawbyoung Date: Wed, 24 Jul 2024 09:06:37 -0700 Subject: [PATCH 16/42] Omit unnecessary braces Created using spr 1.3.4 --- bolt/lib/Profile/StaleProfileMatching.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index b5ddef210d5e9..26cbd8250ce58 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -233,9 +233,8 @@ class StaleMatcher { if (BestBlock) return BestBlock; BestBlock = matchWithPseudoProbes(BlendedHash, PseudoProbes); - if (BestBlock) { + if (BestBlock) MatchedWithPseudoProbes.insert(BlendedHash.combine()); - } return BestBlock; } From 9889f8903c85d1eae4a1cd536bda03d6959c8ba4 Mon Sep 17 00:00:00 2001 From: shawbyoung Date: Wed, 24 Jul 2024 09:51:22 -0700 Subject: [PATCH 17/42] Change initialization of index -> probe and probe -> block mappings Created using spr 1.3.4 --- 
bolt/lib/Profile/StaleProfileMatching.cpp | 81 ++++++++++++----------- 1 file changed, 41 insertions(+), 40 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index 26cbd8250ce58..6e7525b66aaae 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -193,16 +193,12 @@ struct BlendedBlockHash { /// release. class StaleMatcher { public: + StaleMatcher(const uint64_t YamlBFGUID) : YamlBFGUID(YamlBFGUID) {} + /// Initialize stale matcher. void init(const std::vector &Blocks, const std::vector &Hashes, - const std::vector &CallHashes, - const std::unordered_map> - &IndexToBBPseudoProbes, - const std::unordered_map - &BBPseudoProbeToBlock, - const uint64_t YamlBFGUID) { + const std::vector &CallHashes) { assert(Blocks.size() == Hashes.size() && Hashes.size() == CallHashes.size() && "incorrect matcher initialization"); @@ -214,9 +210,17 @@ class StaleMatcher { CallHashToBlocks[CallHashes[I]].push_back( std::make_pair(Hashes[I], Block)); } - this->IndexToBBPseudoProbes = IndexToBBPseudoProbes; - this->BBPseudoProbeToBlock = BBPseudoProbeToBlock; - this->YamlBFGUID = YamlBFGUID; + } + + /// Creates a mapping from a pseudo probe index to block pseudo probe in the + /// binary. + void mapIndexToProbe(uint64_t Index, const MCDecodedPseudoProbe *Probe) { + IndexToBBPseudoProbes[Index].push_back(Probe); + } + + /// Creates a mapping from a pseudo probe to a flow block. + void mapProbeToBB(const MCDecodedPseudoProbe *Probe, FlowBlock *Block) { + BBPseudoProbeToBlock[Probe] = Block; } /// Find the most similar flow block for a profile block given its hashes and @@ -269,7 +273,7 @@ class StaleMatcher { std::unordered_map BBPseudoProbeToBlock; std::unordered_set MatchedWithPseudoProbes; - uint64_t YamlBFGUID{0}; + const uint64_t YamlBFGUID{0}; uint64_t MatchedWithOpcodes{0}; // Uses OpcodeHash to find the most similar block for a given hash. 
@@ -557,14 +561,32 @@ size_t matchWeightsByHashes( assert(Func.Blocks.size() == BlockOrder.size() + 2); + // Sets the YamlBFGUID in the StaleMatcher such that if either the profiled or + // binary function dne or they are not equal, to zero, as not to perform + // pseudo probe block matching. Otherwise, the YamlBF's GUID is used for + // pseudo probe block matching. + const MCPseudoProbeDecoder *PseudoProbeDecoder = + opts::ProfileUsePseudoProbes ? BC.getPseudoProbeDecoder() : nullptr; + uint64_t BFPseudoProbeDescHash = 0; + if (opts::ProfileUsePseudoProbes && BF.getGUID() != 0) { + assert(PseudoProbeDecoder && + "If BF has pseudo probe, BC should have a pseudo probe decoder"); + auto &GUID2FuncDescMap = PseudoProbeDecoder->getGUID2FuncDescMap(); + auto It = GUID2FuncDescMap.find(BF.getGUID()); + if (It != GUID2FuncDescMap.end()) + BFPseudoProbeDescHash = It->second.FuncHash; + } + uint64_t YamlBFGUID = + BFPseudoProbeDescHash && YamlBF.PseudoProbeDescHash && + BFPseudoProbeDescHash == YamlBF.PseudoProbeDescHash + ? 
static_cast(YamlBF.GUID) + : 0; + + StaleMatcher Matcher(YamlBFGUID); std::vector CallHashes; std::vector Blocks; std::vector BlendedHashes; - std::unordered_map> - IndexToBBPseudoProbes; - std::unordered_map - BBPseudoProbeToBlock; - const MCPseudoProbeDecoder *PseudoProbeDecoder = BC.getPseudoProbeDecoder(); + for (uint64_t I = 0; I < BlockOrder.size(); I++) { const BinaryBasicBlock *BB = BlockOrder[I]; assert(BB->getHash() != 0 && "empty hash of BinaryBasicBlock"); @@ -600,8 +622,8 @@ size_t matchWeightsByHashes( continue; if (Probe.getType() != static_cast(PseudoProbeType::Block)) continue; - IndexToBBPseudoProbes[Probe.getIndex()].push_back(&Probe); - BBPseudoProbeToBlock[&Probe] = Blocks[I]; + Matcher.mapIndexToProbe(Probe.getIndex(), &Probe); + Matcher.mapProbeToBB(&Probe, Blocks[I]); } } } @@ -610,28 +632,7 @@ size_t matchWeightsByHashes( << Twine::utohexstr(BB->getHash()) << "\n"); } - // Sets the YamlBFGUID in the StaleMatcher such that if either the profiled or - // binary function dne or they are not equal, to zero, as not to perform - // pseudo probe block matching. Otherwise, the YamlBF's GUID is used for - // pseudo probe block matching. - uint64_t BFPseudoProbeDescHash = 0; - if (opts::ProfileUsePseudoProbes && BF.getGUID() != 0) { - assert(PseudoProbeDecoder && - "If BF has pseudo probe, BC should have a pseudo probe decoder"); - auto &GUID2FuncDescMap = PseudoProbeDecoder->getGUID2FuncDescMap(); - auto It = GUID2FuncDescMap.find(BF.getGUID()); - if (It != GUID2FuncDescMap.end()) - BFPseudoProbeDescHash = It->second.FuncHash; - } - uint64_t YamlBFGUID = - BFPseudoProbeDescHash && YamlBF.PseudoProbeDescHash && - BFPseudoProbeDescHash == YamlBF.PseudoProbeDescHash - ? 
static_cast(YamlBF.GUID) - : 0; - - StaleMatcher Matcher; - Matcher.init(Blocks, BlendedHashes, CallHashes, IndexToBBPseudoProbes, - BBPseudoProbeToBlock, YamlBFGUID); + Matcher.init(Blocks, BlendedHashes, CallHashes); // Index in yaml profile => corresponding (matched) block DenseMap MatchedBlocks; From 022c517af0278979b92b48d75503b4278880d040 Mon Sep 17 00:00:00 2001 From: shawbyoung Date: Wed, 24 Jul 2024 09:54:02 -0700 Subject: [PATCH 18/42] Formatting Created using spr 1.3.4 --- bolt/lib/Profile/StaleProfileMatching.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index 6e7525b66aaae..37140a7dd1e57 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -586,7 +586,6 @@ size_t matchWeightsByHashes( std::vector CallHashes; std::vector Blocks; std::vector BlendedHashes; - for (uint64_t I = 0; I < BlockOrder.size(); I++) { const BinaryBasicBlock *BB = BlockOrder[I]; assert(BB->getHash() != 0 && "empty hash of BinaryBasicBlock"); @@ -631,7 +630,6 @@ size_t matchWeightsByHashes( LLVM_DEBUG(dbgs() << "BB with index " << I << " has hash = " << Twine::utohexstr(BB->getHash()) << "\n"); } - Matcher.init(Blocks, BlendedHashes, CallHashes); // Index in yaml profile => corresponding (matched) block From 5109893be3eaf62e756e4837b6a49b9aa4b0824b Mon Sep 17 00:00:00 2001 From: shawbyoung Date: Wed, 24 Jul 2024 10:12:25 -0700 Subject: [PATCH 19/42] Comments Created using spr 1.3.4 --- bolt/lib/Profile/StaleProfileMatching.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index 37140a7dd1e57..ba271cdbb2dc0 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -212,8 +212,7 @@ class StaleMatcher { } } - /// Creates a mapping from a pseudo probe index to block pseudo probe 
in the - /// binary. + /// Creates a mapping from a pseudo probe index to pseudo probe. void mapIndexToProbe(uint64_t Index, const MCDecodedPseudoProbe *Probe) { IndexToBBPseudoProbes[Index].push_back(Probe); } @@ -321,12 +320,13 @@ class StaleMatcher { const std::vector &PseudoProbes) const { if (!YamlBFGUID) return nullptr; - // Searches for the pseudo probe attached to the matched function's block, - // ignoring pseudo probes attached to function calls and inlined functions' - // blocks. + if (opts::Verbosity >= 3) outs() << "BOLT-INFO: attempting to match block with pseudo probes\n"; + // Searches for the pseudo probe attached to the matched function's block, + // ignoring pseudo probes attached to function calls and inlined functions' + // blocks. std::vector BlockPseudoProbes; for (const auto &PseudoProbe : PseudoProbes) { // Ensures that pseudo probe information belongs to the appropriate From 5bf42207453830f9690895d9dd4efa78365c7bb1 Mon Sep 17 00:00:00 2001 From: shawbyoung Date: Wed, 24 Jul 2024 16:09:05 -0700 Subject: [PATCH 20/42] Changed std ADTs to LLVM Created using spr 1.3.4 --- bolt/lib/Profile/StaleProfileMatching.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index ba271cdbb2dc0..65eb3b0a419ce 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -267,11 +267,10 @@ class StaleMatcher { using HashBlockPairType = std::pair; std::unordered_map> OpHashToBlocks; std::unordered_map> CallHashToBlocks; - std::unordered_map> + DenseMap> IndexToBBPseudoProbes; - std::unordered_map - BBPseudoProbeToBlock; - std::unordered_set MatchedWithPseudoProbes; + DenseMap BBPseudoProbeToBlock; + DenseSet MatchedWithPseudoProbes; const uint64_t YamlBFGUID{0}; uint64_t MatchedWithOpcodes{0}; @@ -327,7 +326,7 @@ class StaleMatcher { // Searches for the pseudo probe attached to the matched function's 
block, // ignoring pseudo probes attached to function calls and inlined functions' // blocks. - std::vector BlockPseudoProbes; + SmallVector BlockPseudoProbes; for (const auto &PseudoProbe : PseudoProbes) { // Ensures that pseudo probe information belongs to the appropriate // function and not an inlined function. From f1179b11812841b6cdf78fc3c2e166279246cd08 Mon Sep 17 00:00:00 2001 From: shawbyoung Date: Wed, 24 Jul 2024 16:16:03 -0700 Subject: [PATCH 21/42] In matchWithPseudoProbe, hoist BlocksPseudoProbes.size(), added logging helpr Created using spr 1.3.4 --- bolt/lib/Profile/StaleProfileMatching.cpp | 28 +++++++++++++---------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index 65eb3b0a419ce..7f45077bf7c86 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -338,30 +338,34 @@ class StaleMatcher { BlockPseudoProbes.push_back(&PseudoProbe); } + + auto LogPseudoProbeBlockMatchFail = [&](std::string Message) { + if (opts::Verbosity >= 3) + errs() << Message; + }; // Returns nullptr if there is not a 1:1 mapping of the yaml block pseudo // probe and binary pseudo probe. 
- if (BlockPseudoProbes.size() == 0) { - if (opts::Verbosity >= 3) - errs() << "BOLT-WARNING: no pseudo probes in profile block\n"; + size_t NBlockPseudoProbes = BlockPseudoProbes.size(); + if (NBlockPseudoProbes == 0) { + LogPseudoProbeBlockMatchFail( + "BOLT-WARNING: no pseudo probes in profile block\n"); return nullptr; } - if (BlockPseudoProbes.size() > 1) { - if (opts::Verbosity >= 3) - errs() << "BOLT-WARNING: more than 1 pseudo probes in profile block\n"; + if (BNBlockPseudoProbes > 1) { + LogPseudoProbeBlockMatchFail( + "BOLT-WARNING: more than 1 pseudo probes in profile block\n"); return nullptr; } uint64_t Index = BlockPseudoProbes[0]->Index; auto It = IndexToBBPseudoProbes.find(Index); if (It == IndexToBBPseudoProbes.end()) { - if (opts::Verbosity >= 3) - errs() << "BOLT-WARNING: no block pseudo probes found within binary " - "block at index\n"; + LogPseudoProbeBlockMatchFail( + "BOLT-WARNING: no block pseudo probes found within BB at index\n"); return nullptr; } if (It->second.size() > 1) { - if (opts::Verbosity >= 3) - errs() << "BOLT-WARNING: more than 1 block pseudo probes in binary " - "block at index\n"; + LogPseudoProbeBlockMatchFail( + "BOLT-WARNING: more than 1 block pseudo probes in BB at index\n"); return nullptr; } const MCDecodedPseudoProbe *BinaryPseudoProbe = It->second[0]; From 5076bab518abdf7994111026898d4d6c08f21e2b Mon Sep 17 00:00:00 2001 From: shawbyoung Date: Wed, 24 Jul 2024 16:23:42 -0700 Subject: [PATCH 22/42] A more beautiful helper function for matchWithPseudoProbes Created using spr 1.3.4 --- bolt/lib/Profile/StaleProfileMatching.cpp | 32 ++++++++++------------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index 7f45077bf7c86..9d2584767c478 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -339,35 +339,31 @@ class StaleMatcher { 
BlockPseudoProbes.push_back(&PseudoProbe); } - auto LogPseudoProbeBlockMatchFail = [&](std::string Message) { - if (opts::Verbosity >= 3) + auto LogErrIfExpr = [&](bool Expr, std::string Message) -> bool { + if (Expr) errs() << Message; + return Expr; }; // Returns nullptr if there is not a 1:1 mapping of the yaml block pseudo // probe and binary pseudo probe. size_t NBlockPseudoProbes = BlockPseudoProbes.size(); - if (NBlockPseudoProbes == 0) { - LogPseudoProbeBlockMatchFail( - "BOLT-WARNING: no pseudo probes in profile block\n"); + if (LogErrIfExpr(NBlockPseudoProbes == 0, + "BOLT-WARNING: no pseudo probes in profile block\n")) return nullptr; - } - if (BNBlockPseudoProbes > 1) { - LogPseudoProbeBlockMatchFail( - "BOLT-WARNING: more than 1 pseudo probes in profile block\n"); + if (LogErrIfExpr( + NBlockPseudoProbes > 1, + "BOLT-WARNING: more than 1 pseudo probes in profile block\n")) return nullptr; - } uint64_t Index = BlockPseudoProbes[0]->Index; auto It = IndexToBBPseudoProbes.find(Index); - if (It == IndexToBBPseudoProbes.end()) { - LogPseudoProbeBlockMatchFail( - "BOLT-WARNING: no block pseudo probes found within BB at index\n"); + if (LogErrIfExpr( + It == IndexToBBPseudoProbes.end(), + "BOLT-WARNING: no block pseudo probes found within BB at index\n")) return nullptr; - } - if (It->second.size() > 1) { - LogPseudoProbeBlockMatchFail( - "BOLT-WARNING: more than 1 block pseudo probes in BB at index\n"); + if (LogErrIfExpr( + It->second.size() > 1, + "BOLT-WARNING: more than 1 block pseudo probes in BB at index\n")) return nullptr; - } const MCDecodedPseudoProbe *BinaryPseudoProbe = It->second[0]; auto BinaryPseudoProbeIt = BBPseudoProbeToBlock.find(BinaryPseudoProbe); assert(BinaryPseudoProbeIt != BBPseudoProbeToBlock.end() && From 4f2f64211c3aabe740e013e5d1be0dc6771f7f60 Mon Sep 17 00:00:00 2001 From: shawbyoung Date: Thu, 25 Jul 2024 10:02:04 -0700 Subject: [PATCH 23/42] Added inlined block pseudo probe matching Created using spr 1.3.4 --- 
bolt/lib/Profile/StaleProfileMatching.cpp | 142 ++++++++++++++++------ 1 file changed, 105 insertions(+), 37 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index 9d2584767c478..fd9be5f44fe61 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -212,9 +212,15 @@ class StaleMatcher { } } + /// Creates a mapping from a inlined pseudo probe's guid and index to probe. + void mapGUIDAndIndexToProbe(uint64_t Guid, uint64_t Index, + const MCDecodedPseudoProbe *Probe) { + IndexAndGUIDToInlinedProbes[Guid][Index].push_back(Probe); + } + /// Creates a mapping from a pseudo probe index to pseudo probe. void mapIndexToProbe(uint64_t Index, const MCDecodedPseudoProbe *Probe) { - IndexToBBPseudoProbes[Index].push_back(Probe); + IndexToProbes[Index].push_back(Probe); } /// Creates a mapping from a pseudo probe to a flow block. @@ -267,8 +273,10 @@ class StaleMatcher { using HashBlockPairType = std::pair; std::unordered_map> OpHashToBlocks; std::unordered_map> CallHashToBlocks; - DenseMap> - IndexToBBPseudoProbes; + DenseMap> IndexToProbes; + DenseMap>> + IndexAndGUIDToInlinedProbes; DenseMap BBPseudoProbeToBlock; DenseSet MatchedWithPseudoProbes; const uint64_t YamlBFGUID{0}; @@ -312,40 +320,68 @@ class StaleMatcher { } return BestBlock; } - // Uses pseudo probe information to attach the profile to the appropriate - // block. - const FlowBlock *matchWithPseudoProbes( - BlendedBlockHash BlendedHash, - const std::vector &PseudoProbes) const { - if (!YamlBFGUID) - return nullptr; + /// A helper function for logging. + static bool LogErrIfExpr(bool Expr, std::string Message) { + if (Expr) + errs() << Message; + return Expr; + } + + /// Matches an inlined profile block with an inlined binary block based on + /// pseudo probes. 
+ const FlowBlock *matchWithInlinedBlockPseudoProbes( + SmallVector + &InlinedBlockPseudoProbes) const { if (opts::Verbosity >= 3) - outs() << "BOLT-INFO: attempting to match block with pseudo probes\n"; + outs() << "BOLT-INFO: attempting to match block with inlined block " + "pseudo probes\n"; - // Searches for the pseudo probe attached to the matched function's block, - // ignoring pseudo probes attached to function calls and inlined functions' - // blocks. - SmallVector BlockPseudoProbes; - for (const auto &PseudoProbe : PseudoProbes) { - // Ensures that pseudo probe information belongs to the appropriate - // function and not an inlined function. - if (PseudoProbe.GUID != YamlBFGUID) - continue; - // Skips pseudo probes attached to function calls. - if (PseudoProbe.Type != static_cast(PseudoProbeType::Block)) - continue; + size_t NInlinedBlockPseudoProbes = InlinedBlockPseudoProbes.size(); + if (LogErrIfExpr(NInlinedBlockPseudoProbes == 0, + "BOLT-WARNING: no pseudo probes in profile block\n")) + return nullptr; + if (LogErrIfExpr( + NInlinedBlockPseudoProbes > 1, + "BOLT-WARNING: more than 1 pseudo probes in profile block\n")) + return nullptr; - BlockPseudoProbes.push_back(&PseudoProbe); - } + const auto *InlinedPseudoProbe = InlinedBlockPseudoProbes[0]; + uint64_t Guid = InlinedPseudoProbe->GUID; + uint64_t Index = InlinedPseudoProbe->Index; + + auto GuidIt = IndexAndGUIDToInlinedProbes.find(Guid); + if (LogErrIfExpr( + GuidIt == IndexAndGUIDToInlinedProbes.end(), + "BOLT-WARNING: no pseudo probes found within BB at index\n")) + return nullptr; + auto IndexIt = GuidIt->second.find(Index); + if (LogErrIfExpr( + IndexIt == GuidIt->second.end(), + "BOLT-WARNING: no pseudo probes found within BB at index\n")) + return nullptr; + + if (LogErrIfExpr( + IndexIt->second.size() > 1, + "BOLT-WARNING: more than 1 block pseudo probes in BB at index\n")) + return nullptr; + + const MCDecodedPseudoProbe *BinaryPseudoProbe = IndexIt->second[0]; + auto BinaryPseudoProbeIt = 
BBPseudoProbeToBlock.find(BinaryPseudoProbe); + assert(BinaryPseudoProbeIt != BBPseudoProbeToBlock.end() && + "All binary pseudo probes should belong a binary basic block"); + + return BinaryPseudoProbeIt->second; + } + + /// Matches a profile block with an binary block based on pseudo probes. + const FlowBlock *matchWithNonInlinedBlockPseudoProbes( + SmallVector &BlockPseudoProbes) + const { + if (opts::Verbosity >= 3) + outs() << "BOLT-INFO: attempting to match block with inlined block " + "pseudo probes\n"; - auto LogErrIfExpr = [&](bool Expr, std::string Message) -> bool { - if (Expr) - errs() << Message; - return Expr; - }; - // Returns nullptr if there is not a 1:1 mapping of the yaml block pseudo - // probe and binary pseudo probe. size_t NBlockPseudoProbes = BlockPseudoProbes.size(); if (LogErrIfExpr(NBlockPseudoProbes == 0, "BOLT-WARNING: no pseudo probes in profile block\n")) @@ -355,9 +391,9 @@ class StaleMatcher { "BOLT-WARNING: more than 1 pseudo probes in profile block\n")) return nullptr; uint64_t Index = BlockPseudoProbes[0]->Index; - auto It = IndexToBBPseudoProbes.find(Index); + auto It = IndexToProbes.find(Index); if (LogErrIfExpr( - It == IndexToBBPseudoProbes.end(), + It == IndexToProbes.end(), "BOLT-WARNING: no block pseudo probes found within BB at index\n")) return nullptr; if (LogErrIfExpr( @@ -371,6 +407,36 @@ class StaleMatcher { return BinaryPseudoProbeIt->second; } + + /// Uses pseudo probe information to attach the profile to the appropriate + /// block. + const FlowBlock *matchWithPseudoProbes( + BlendedBlockHash BlendedHash, + const std::vector &PseudoProbes) const { + if (!YamlBFGUID) + return nullptr; + + // Searches for the pseudo probe attached to the matched function's block. + SmallVector BlockPseudoProbes; + SmallVector InlinedBlockPseudoProbes; + for (const auto &PseudoProbe : PseudoProbes) { + // Skips pseudo probes attached to function calls. 
+ if (PseudoProbe.Type != static_cast(PseudoProbeType::Block)) + continue; + if (PseudoProbe.GUID != YamlBFGUID) + InlinedBlockPseudoProbes.push_back(&PseudoProbe); + else + BlockPseudoProbes.push_back(&PseudoProbe); + } + + // Returns nullptr if there is not a 1:1 mapping of the profile block pseudo + // probe and a binary block pseudo probe. + const FlowBlock *MatchedInlinedBlock = + matchWithInlinedBlockPseudoProbes(InlinedBlockPseudoProbes); + return MatchedInlinedBlock + ? MatchedInlinedBlock + : matchWithNonInlinedBlockPseudoProbes(BlockPseudoProbes); + } }; void BinaryFunction::computeBlockHashes(HashFunction HashFunction) const { @@ -616,11 +682,13 @@ size_t matchWeightsByHashes( ProbeMap.lower_bound(FuncAddr + BlockRange.second)); for (const auto &[_, Probes] : BlockProbes) { for (const MCDecodedPseudoProbe &Probe : Probes) { - if (Probe.getInlineTreeNode()->hasInlineSite()) - continue; if (Probe.getType() != static_cast(PseudoProbeType::Block)) continue; - Matcher.mapIndexToProbe(Probe.getIndex(), &Probe); + if (Probe.getInlineTreeNode()->hasInlineSite()) + Matcher.mapGUIDAndIndexToProbe(Probe.getGuid(), Probe.getIndex(), + &Probe); + else + Matcher.mapIndexToProbe(Probe.getIndex(), &Probe); Matcher.mapProbeToBB(&Probe, Blocks[I]); } } From 327eb81f46912cf3a52f3228d6a40ba8925e7fa0 Mon Sep 17 00:00:00 2001 From: shawbyoung Date: Thu, 25 Jul 2024 10:31:56 -0700 Subject: [PATCH 24/42] Added flag to trigger pseudo probe block matching Created using spr 1.3.4 --- bolt/lib/Profile/StaleProfileMatching.cpp | 10 ++++++++-- bolt/test/X86/match-blocks-with-pseudo-probes.test | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index fd9be5f44fe61..99c1523ee7163 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -118,6 +118,11 @@ cl::opt StaleMatchingCostJumpUnknownFTInc( "The cost of increasing an unknown 
fall-through jump count by one."), cl::init(3), cl::ReallyHidden, cl::cat(BoltOptCategory)); +cl::opt StaleMatchingWithBlockPseudoProbes( + "stale-matching-with-block-pseudo-probes", + cl::desc("Turns on stale matching with block pseudo probes."), cl::init(3), + cl::ReallyHidden, cl::cat(BoltOptCategory)); + } // namespace opts namespace llvm { @@ -413,7 +418,7 @@ class StaleMatcher { const FlowBlock *matchWithPseudoProbes( BlendedBlockHash BlendedHash, const std::vector &PseudoProbes) const { - if (!YamlBFGUID) + if (!opts::StaleMatchingWithBlockPseudoProbes || !YamlBFGUID) return nullptr; // Searches for the pseudo probe attached to the matched function's block. @@ -671,7 +676,8 @@ size_t matchWeightsByHashes( BlendedBlockHash BlendedHash(BB->getHash()); BlendedHashes.push_back(BlendedHash); // Collects pseudo probes attached to the BB for use in the StaleMatcher. - if (opts::ProfileUsePseudoProbes && PseudoProbeDecoder) { + if (opts::ProfileUsePseudoProbes && + opts::StaleMatchingWithBlockPseudoProbes && PseudoProbeDecoder) { const AddressProbesMap &ProbeMap = PseudoProbeDecoder->getAddress2ProbesMap(); const uint64_t FuncAddr = BF.getAddress(); diff --git a/bolt/test/X86/match-blocks-with-pseudo-probes.test b/bolt/test/X86/match-blocks-with-pseudo-probes.test index 9b73117271b55..9bb1334876e09 100644 --- a/bolt/test/X86/match-blocks-with-pseudo-probes.test +++ b/bolt/test/X86/match-blocks-with-pseudo-probes.test @@ -5,7 +5,7 @@ # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/main.s -o %t.o # RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib # RUN: llvm-bolt %t.exe -o %t.out --data %t/yaml -v=2 \ -# RUN: --print-cfg --funcs=main --profile-ignore-hash=0 --infer-stale-profile --profile-use-pseudo-probes 2>&1 | FileCheck %s +# RUN: --print-cfg --funcs=main --profile-ignore-hash=0 --infer-stale-profile --profile-use-pseudo-probes --stale-matching-with-block-pseudo-probes 2>&1 | FileCheck %s # CHECK: BOLT-INFO: inference found a pseudo probe match 
for 100.00% of basic blocks (1 out of 1 stale) responsible for -nan% samples (0 out of 0 stale) From 37793aaa4c371cda1201c5d18f9bc233df1895dd Mon Sep 17 00:00:00 2001 From: shawbyoung Date: Thu, 25 Jul 2024 14:30:54 -0700 Subject: [PATCH 25/42] Added flag for pseudo probe block matching Created using spr 1.3.4 --- bolt/lib/Profile/StaleProfileMatching.cpp | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index 99c1523ee7163..c5558bf923ea0 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -433,7 +433,6 @@ class StaleMatcher { else BlockPseudoProbes.push_back(&PseudoProbe); } - // Returns nullptr if there is not a 1:1 mapping of the profile block pseudo // probe and a binary block pseudo probe. const FlowBlock *MatchedInlinedBlock = @@ -636,9 +635,12 @@ size_t matchWeightsByHashes( // pseudo probe block matching. Otherwise, the YamlBF's GUID is used for // pseudo probe block matching. const MCPseudoProbeDecoder *PseudoProbeDecoder = - opts::ProfileUsePseudoProbes ? BC.getPseudoProbeDecoder() : nullptr; + opts::ProfileUsePseudoProbes && opts::StaleMatchingWithBlockPseudoProbes + ? BC.getPseudoProbeDecoder() + : nullptr; uint64_t BFPseudoProbeDescHash = 0; - if (opts::ProfileUsePseudoProbes && BF.getGUID() != 0) { + if (opts::ProfileUsePseudoProbes && + opts::StaleMatchingWithBlockPseudoProbes && BF.getGUID() != 0) { assert(PseudoProbeDecoder && "If BF has pseudo probe, BC should have a pseudo probe decoder"); auto &GUID2FuncDescMap = PseudoProbeDecoder->getGUID2FuncDescMap(); @@ -646,13 +648,8 @@ size_t matchWeightsByHashes( if (It != GUID2FuncDescMap.end()) BFPseudoProbeDescHash = It->second.FuncHash; } - uint64_t YamlBFGUID = - BFPseudoProbeDescHash && YamlBF.PseudoProbeDescHash && - BFPseudoProbeDescHash == YamlBF.PseudoProbeDescHash - ? 
static_cast(YamlBF.GUID) - : 0; - StaleMatcher Matcher(YamlBFGUID); + StaleMatcher Matcher(YamlBF.GUID); std::vector CallHashes; std::vector Blocks; std::vector BlendedHashes; @@ -677,7 +674,11 @@ size_t matchWeightsByHashes( BlendedHashes.push_back(BlendedHash); // Collects pseudo probes attached to the BB for use in the StaleMatcher. if (opts::ProfileUsePseudoProbes && - opts::StaleMatchingWithBlockPseudoProbes && PseudoProbeDecoder) { + opts::StaleMatchingWithBlockPseudoProbes && BFPseudoProbeDescHash && + YamlBF.PseudoProbeDescHash && + BFPseudoProbeDescHash == YamlBF.PseudoProbeDescHash) { + assert(PseudoProbeDecoder && + "If pseudo probes are in use, psuedo probe decoder should exist"); const AddressProbesMap &ProbeMap = PseudoProbeDecoder->getAddress2ProbesMap(); const uint64_t FuncAddr = BF.getAddress(); From ba00b22aaec16f3b50f7b5e084e1c8683221ba20 Mon Sep 17 00:00:00 2001 From: shawbyoung Date: Thu, 25 Jul 2024 14:38:29 -0700 Subject: [PATCH 26/42] Set flag init val, changed std::string to StringRef Created using spr 1.3.4 --- bolt/lib/Profile/StaleProfileMatching.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index c5558bf923ea0..ef9320ae168fe 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -120,8 +120,8 @@ cl::opt StaleMatchingCostJumpUnknownFTInc( cl::opt StaleMatchingWithBlockPseudoProbes( "stale-matching-with-block-pseudo-probes", - cl::desc("Turns on stale matching with block pseudo probes."), cl::init(3), - cl::ReallyHidden, cl::cat(BoltOptCategory)); + cl::desc("Turns on stale matching with block pseudo probes."), + cl::init(false), cl::ReallyHidden, cl::cat(BoltOptCategory)); } // namespace opts @@ -327,7 +327,7 @@ class StaleMatcher { } /// A helper function for logging. 
- static bool LogErrIfExpr(bool Expr, std::string Message) { + static bool LogErrIfExpr(bool Expr, StringRef Message) { if (Expr) errs() << Message; return Expr; From 5e47249c00c6f0825c19496e628d1f31d56894c9 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Wed, 31 Jul 2024 22:12:34 -0700 Subject: [PATCH 27/42] [BOLT][NFC] Add timers for MetadataManager invocations Test Plan: added bolt/test/timers.c Reviewers: ayermolo, maksfb, rafaelauler, dcci Reviewed By: dcci Pull Request: https://github.com/llvm/llvm-project/pull/101267 --- bolt/lib/Rewrite/RewriteInstance.cpp | 10 ++++++++++ bolt/test/timers.c | 15 +++++++++++++++ 2 files changed, 25 insertions(+) create mode 100644 bolt/test/timers.c diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 33ebae3b6e6de..b7e361c35088a 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -3131,18 +3131,24 @@ void RewriteInstance::initializeMetadataManager() { } void RewriteInstance::processSectionMetadata() { + NamedRegionTimer T("processmetadata-section", "process section metadata", + TimerGroupName, TimerGroupDesc, opts::TimeRewrite); initializeMetadataManager(); MetadataManager.runSectionInitializers(); } void RewriteInstance::processMetadataPreCFG() { + NamedRegionTimer T("processmetadata-precfg", "process metadata pre-CFG", + TimerGroupName, TimerGroupDesc, opts::TimeRewrite); MetadataManager.runInitializersPreCFG(); processProfileDataPreCFG(); } void RewriteInstance::processMetadataPostCFG() { + NamedRegionTimer T("processmetadata-postcfg", "process metadata post-CFG", + TimerGroupName, TimerGroupDesc, opts::TimeRewrite); MetadataManager.runInitializersPostCFG(); } @@ -3536,10 +3542,14 @@ void RewriteInstance::emitAndLink() { } void RewriteInstance::finalizeMetadataPreEmit() { + NamedRegionTimer T("finalizemetadata-preemit", "finalize metadata pre-emit", + TimerGroupName, TimerGroupDesc, opts::TimeRewrite); 
MetadataManager.runFinalizersPreEmit(); } void RewriteInstance::updateMetadata() { + NamedRegionTimer T("updatemetadata-postemit", "update metadata post-emit", + TimerGroupName, TimerGroupDesc, opts::TimeRewrite); MetadataManager.runFinalizersAfterEmit(); if (opts::UpdateDebugSections) { diff --git a/bolt/test/timers.c b/bolt/test/timers.c new file mode 100644 index 0000000000000..b16218dd7ea76 --- /dev/null +++ b/bolt/test/timers.c @@ -0,0 +1,15 @@ +/* This test checks timers for metadata manager phases. +# RUN: %clang %cflags %s -o %t.exe +# RUN: link_fdata %s %t.exe %t.fdata +# RUN: llvm-bolt %t.exe -o %t.null --data %t.fdata -w %t.yaml --time-rewrite \ +# RUN: 2>&1 | FileCheck %s + +# CHECK-DAG: update metadata post-emit +# CHECK-DAG: process section metadata +# CHECK-DAG: process metadata pre-CFG +# CHECK-DAG: process metadata post-CFG +# CHECK-DAG: finalize metadata pre-emit + +# FDATA: 0 [unknown] 0 1 main 0 1 0 +*/ +int main() { return 0; } From 3902effbfc181bdac5e2131e8583dca99a33d573 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Mon, 26 Aug 2024 09:05:34 -0700 Subject: [PATCH 28/42] [MC][NFC] Count pseudo probes and function records MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-parse pseudo probes section counting the number of probes and function records. These numbers are used in follow-up diff to pre-allocate vectors for decoded probes and inline tree nodes. Additional benefit is avoiding error handling during parsing. This pre-parsing is fast: for a 404MiB .pseudo_probe section with 43373881 probes and 25228770 function records, it only takes 0.68±0.01s. The total time of buildAddress2ProbeMap is 21s. 
Reviewers: dcci, maksfb, rafaelauler, wlei-llvm, ayermolo Reviewed By: wlei-llvm Pull Request: https://github.com/llvm/llvm-project/pull/102774 --- bolt/lib/Rewrite/PseudoProbeRewriter.cpp | 1 - llvm/include/llvm/MC/MCPseudoProbe.h | 6 + llvm/lib/MC/MCPseudoProbe.cpp | 143 +++++++++++++++++------ 3 files changed, 113 insertions(+), 37 deletions(-) diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp index 886bbdbf9d686..37a5b937ebcaa 100644 --- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp +++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp @@ -143,7 +143,6 @@ void PseudoProbeRewriter::parsePseudoProbe() { if (!ProbeDecoder.buildAddress2ProbeMap( reinterpret_cast(Contents.data()), Contents.size(), GuidFilter, FuncStartAddrs)) { - ProbeDecoder.getAddress2ProbesMap().clear(); errs() << "BOLT-WARNING: fail in building Address2ProbeMap\n"; return; } diff --git a/llvm/include/llvm/MC/MCPseudoProbe.h b/llvm/include/llvm/MC/MCPseudoProbe.h index 5344dea4141b3..44692bb183d5a 100644 --- a/llvm/include/llvm/MC/MCPseudoProbe.h +++ b/llvm/include/llvm/MC/MCPseudoProbe.h @@ -371,6 +371,12 @@ class MCPseudoProbeDecoder { // Decode pseudo_probe_desc section to build GUID to PseudoProbeFuncDesc map. bool buildGUID2FuncDescMap(const uint8_t *Start, std::size_t Size); + // Decode pseudo_probe section to count the number of probes and inlined + // function records for each function record. + template + bool countRecords(bool &Discard, uint32_t &ProbeCount, uint32_t &InlinedCount, + const Uint64Set &GuidFilter); + // Decode pseudo_probe section to build address to probes map for specifed // functions only. 
bool buildAddress2ProbeMap(const uint8_t *Start, std::size_t Size, diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp index a5a030e19b849..21c6c4f766c30 100644 --- a/llvm/lib/MC/MCPseudoProbe.cpp +++ b/llvm/lib/MC/MCPseudoProbe.cpp @@ -18,6 +18,7 @@ #include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" #include "llvm/Support/LEB128.h" #include "llvm/Support/MD5.h" #include "llvm/Support/raw_ostream.h" @@ -429,17 +430,11 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap( Index = Cur->getChildren().size(); } else { // Read inline site for inlinees - auto ErrorOrIndex = readUnsignedNumber(); - if (!ErrorOrIndex) - return false; - Index = std::move(*ErrorOrIndex); + Index = cantFail(errorOrToExpected(readUnsignedNumber())); } // Read guid - auto ErrorOrCurGuid = readUnencodedNumber(); - if (!ErrorOrCurGuid) - return false; - uint64_t Guid = std::move(*ErrorOrCurGuid); + uint64_t Guid = cantFail(errorOrToExpected(readUnencodedNumber())); // Decide if top-level node should be disgarded. if (IsTopLevelFunc && !GuidFilter.empty() && !GuidFilter.count(Guid)) @@ -457,41 +452,27 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap( } // Read number of probes in the current node. 
- auto ErrorOrNodeCount = readUnsignedNumber(); - if (!ErrorOrNodeCount) - return false; - uint32_t NodeCount = std::move(*ErrorOrNodeCount); + uint32_t NodeCount = + cantFail(errorOrToExpected(readUnsignedNumber())); // Read number of direct inlinees - auto ErrorOrCurChildrenToProcess = readUnsignedNumber(); - if (!ErrorOrCurChildrenToProcess) - return false; + uint32_t ChildrenToProcess = + cantFail(errorOrToExpected(readUnsignedNumber())); // Read all probes in this node for (std::size_t I = 0; I < NodeCount; I++) { // Read index - auto ErrorOrIndex = readUnsignedNumber(); - if (!ErrorOrIndex) - return false; - uint32_t Index = std::move(*ErrorOrIndex); + uint32_t Index = + cantFail(errorOrToExpected(readUnsignedNumber())); // Read type | flag. - auto ErrorOrValue = readUnencodedNumber(); - if (!ErrorOrValue) - return false; - uint8_t Value = std::move(*ErrorOrValue); + uint8_t Value = cantFail(errorOrToExpected(readUnencodedNumber())); uint8_t Kind = Value & 0xf; uint8_t Attr = (Value & 0x70) >> 4; // Read address uint64_t Addr = 0; if (Value & 0x80) { - auto ErrorOrOffset = readSignedNumber(); - if (!ErrorOrOffset) - return false; - int64_t Offset = std::move(*ErrorOrOffset); + int64_t Offset = cantFail(errorOrToExpected(readSignedNumber())); Addr = LastAddr + Offset; } else { - auto ErrorOrAddr = readUnencodedNumber(); - if (!ErrorOrAddr) - return false; - Addr = std::move(*ErrorOrAddr); + Addr = cantFail(errorOrToExpected(readUnencodedNumber())); if (isSentinelProbe(Attr)) { // For sentinel probe, the addr field actually stores the GUID of the // split function. Convert it to the real address. 
@@ -508,10 +489,8 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap( uint32_t Discriminator = 0; if (hasDiscriminator(Attr)) { - auto ErrorOrDiscriminator = readUnsignedNumber(); - if (!ErrorOrDiscriminator) - return false; - Discriminator = std::move(*ErrorOrDiscriminator); + Discriminator = + cantFail(errorOrToExpected(readUnsignedNumber())); } if (Cur && !isSentinelProbe(Attr)) { @@ -524,17 +503,109 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap( LastAddr = Addr; } - uint32_t ChildrenToProcess = std::move(*ErrorOrCurChildrenToProcess); for (uint32_t I = 0; I < ChildrenToProcess; I++) { buildAddress2ProbeMap(Cur, LastAddr, GuidFilter, FuncStartAddrs); } + return true; +} + +template +bool MCPseudoProbeDecoder::countRecords(bool &Discard, uint32_t &ProbeCount, + uint32_t &InlinedCount, + const Uint64Set &GuidFilter) { + if (!IsTopLevelFunc) + // Read inline site for inlinees + if (!readUnsignedNumber()) + return false; + + // Read guid + auto ErrorOrCurGuid = readUnencodedNumber(); + if (!ErrorOrCurGuid) + return false; + uint64_t Guid = std::move(*ErrorOrCurGuid); + + // Decide if top-level node should be disgarded. + if (IsTopLevelFunc) { + Discard = !GuidFilter.empty() && !GuidFilter.count(Guid); + if (!Discard) + // Allocate an entry for top-level function record. + ++InlinedCount; + } + + // Read number of probes in the current node. + auto ErrorOrNodeCount = readUnsignedNumber(); + if (!ErrorOrNodeCount) + return false; + uint32_t NodeCount = std::move(*ErrorOrNodeCount); + uint32_t CurrentProbeCount = 0; + + // Read number of direct inlinees + auto ErrorOrCurChildrenToProcess = readUnsignedNumber(); + if (!ErrorOrCurChildrenToProcess) + return false; + uint32_t ChildrenToProcess = std::move(*ErrorOrCurChildrenToProcess); + + // Read all probes in this node + for (std::size_t I = 0; I < NodeCount; I++) { + // Read index + if (!readUnsignedNumber()) + return false; + + // Read type | flag. 
+ auto ErrorOrValue = readUnencodedNumber(); + if (!ErrorOrValue) + return false; + uint8_t Value = std::move(*ErrorOrValue); + + uint8_t Attr = (Value & 0x70) >> 4; + if (Value & 0x80) { + // Offset + if (!readSignedNumber()) + return false; + } else { + // Addr + if (!readUnencodedNumber()) + return false; + } + + if (hasDiscriminator(Attr)) + // Discriminator + if (!readUnsignedNumber()) + return false; + + if (!Discard && !isSentinelProbe(Attr)) + ++CurrentProbeCount; + } + if (!Discard) { + ProbeCount += CurrentProbeCount; + InlinedCount += ChildrenToProcess; + } + + for (uint32_t I = 0; I < ChildrenToProcess; I++) + if (!countRecords(Discard, ProbeCount, InlinedCount, GuidFilter)) + return false; return true; } bool MCPseudoProbeDecoder::buildAddress2ProbeMap( const uint8_t *Start, std::size_t Size, const Uint64Set &GuidFilter, const Uint64Map &FuncStartAddrs) { + // For function records in the order of their appearance in the encoded data + // (DFS), count the number of contained probes and inlined function records. + uint32_t ProbeCount = 0; + uint32_t InlinedCount = 0; + uint32_t TopLevelFuncs = 0; + Data = Start; + End = Data + Size; + bool Discard = false; + while (Data < End) { + if (!countRecords(Discard, ProbeCount, InlinedCount, GuidFilter)) + return false; + TopLevelFuncs += !Discard; + } + assert(Data == End && "Have unprocessed data in pseudo_probe section"); + Data = Start; End = Data + Size; uint64_t LastAddr = 0; From d20d4d6598c3546be964a2df638f4418645bc0b7 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Thu, 25 Jul 2024 16:36:43 -0700 Subject: [PATCH 29/42] [MC][NFC] Drop unused MCDecodedPseudoProbeInlineTree::ChildrenToProcess (#100576) The usage was removed in 3f97016857b0305294f3a55ea220884fb50ce033. Results in a slight peak RSS reduction in `perf2bolt --profile-use-pseudo-probes` from 17.24 to 16.85 GiB. 
--- llvm/include/llvm/MC/MCPseudoProbe.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/include/llvm/MC/MCPseudoProbe.h b/llvm/include/llvm/MC/MCPseudoProbe.h index 44692bb183d5a..f3539b23b8a35 100644 --- a/llvm/include/llvm/MC/MCPseudoProbe.h +++ b/llvm/include/llvm/MC/MCPseudoProbe.h @@ -280,8 +280,6 @@ class MCDecodedPseudoProbeInlineTree MCDecodedPseudoProbeInlineTree> { public: InlineSite ISite; - // Used for decoding - uint32_t ChildrenToProcess = 0; MCDecodedPseudoProbeInlineTree() = default; MCDecodedPseudoProbeInlineTree(const InlineSite &Site) : ISite(Site){}; From a857d324de090fe9723e999eadc6cb29d8141a93 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Sat, 10 Aug 2024 23:45:47 -0700 Subject: [PATCH 30/42] [profgen][NFC] Pass parameter as const_ref Pass `ProbeNode` parameter of `trackInlineesOptimizedAway` as const reference. Reviewers: wlei-llvm, WenleiHe Reviewed By: WenleiHe Pull Request: https://github.com/llvm/llvm-project/pull/102787 --- llvm/include/llvm/MC/MCPseudoProbe.h | 1 + llvm/tools/llvm-profgen/ProfiledBinary.cpp | 3 ++- llvm/tools/llvm-profgen/ProfiledBinary.h | 7 ++++--- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/MC/MCPseudoProbe.h b/llvm/include/llvm/MC/MCPseudoProbe.h index f3539b23b8a35..3dd10c0717679 100644 --- a/llvm/include/llvm/MC/MCPseudoProbe.h +++ b/llvm/include/llvm/MC/MCPseudoProbe.h @@ -241,6 +241,7 @@ class MCPseudoProbeInlineTreeBase { InlinedProbeTreeMap &getChildren() { return Children; } const InlinedProbeTreeMap &getChildren() const { return Children; } std::vector &getProbes() { return Probes; } + const std::vector &getProbes() const { return Probes; } void addProbes(ProbeType Probe) { Probes.push_back(Probe); } // Caller node of the inline site MCPseudoProbeInlineTreeBase *Parent = diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp index 632ddc7b50f54..574a9c9f52bf1 100644 --- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp 
+++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp @@ -137,7 +137,8 @@ void BinarySizeContextTracker::trackInlineesOptimizedAway( void BinarySizeContextTracker::trackInlineesOptimizedAway( MCPseudoProbeDecoder &ProbeDecoder, - MCDecodedPseudoProbeInlineTree &ProbeNode, ProbeFrameStack &ProbeContext) { + const MCDecodedPseudoProbeInlineTree &ProbeNode, + ProbeFrameStack &ProbeContext) { StringRef FuncName = ProbeDecoder.getFuncDescForGUID(ProbeNode.Guid)->FuncName; ProbeContext.emplace_back(FuncName, 0); diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.h b/llvm/tools/llvm-profgen/ProfiledBinary.h index f2eeca4545459..0588cb48b2af6 100644 --- a/llvm/tools/llvm-profgen/ProfiledBinary.h +++ b/llvm/tools/llvm-profgen/ProfiledBinary.h @@ -167,9 +167,10 @@ class BinarySizeContextTracker { void trackInlineesOptimizedAway(MCPseudoProbeDecoder &ProbeDecoder); using ProbeFrameStack = SmallVector>; - void trackInlineesOptimizedAway(MCPseudoProbeDecoder &ProbeDecoder, - MCDecodedPseudoProbeInlineTree &ProbeNode, - ProbeFrameStack &Context); + void + trackInlineesOptimizedAway(MCPseudoProbeDecoder &ProbeDecoder, + const MCDecodedPseudoProbeInlineTree &ProbeNode, + ProbeFrameStack &Context); void dump() { RootContext.dumpTree(); } From cddea6a015b94140e96dee4d0fa902f8536c0a81 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Mon, 26 Aug 2024 09:09:13 -0700 Subject: [PATCH 31/42] [MC][NFC] Statically allocate storage for decoded pseudo probes and function records Use #102774 to allocate storage for decoded probes (`PseudoProbeVec`) and function records (`InlineTreeVec`). Leverage that to also shrink sizes of `MCDecodedPseudoProbe`: - Drop Guid since it's accessible via `InlineTree`. `MCDecodedPseudoProbeInlineTree`: - Keep track of probes and inlinees using `ArrayRef`s now that probes and function records belonging to the same function are allocated contiguously. 
This reduces peak RSS from 13.7 GiB to 9.7 GiB and pseudo probe parsing time (as part of perf2bolt) from 15.3s to 9.6s for a large binary with 400MiB .pseudo_probe section containing 43M probes and 25M function records. Depends on: #102774 #102787 #102788 Reviewers: maksfb, rafaelauler, dcci, ayermolo, wlei-llvm Reviewed By: wlei-llvm Pull Request: https://github.com/llvm/llvm-project/pull/102789 --- bolt/lib/Rewrite/PseudoProbeRewriter.cpp | 30 ++-- llvm/include/llvm/MC/MCPseudoProbe.h | 136 +++++++++++++------ llvm/lib/MC/MCPseudoProbe.cpp | 58 +++++--- llvm/tools/llvm-profgen/ProfileGenerator.cpp | 6 +- llvm/tools/llvm-profgen/ProfiledBinary.cpp | 10 +- 5 files changed, 164 insertions(+), 76 deletions(-) diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp index 37a5b937ebcaa..9677530919b90 100644 --- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp +++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp @@ -200,7 +200,9 @@ void PseudoProbeRewriter::updatePseudoProbes() { } unsigned ProbeTrack = AP.second.size(); - std::list::iterator Probe = AP.second.begin(); + auto Probe = llvm::map_iterator( + AP.second.begin(), + [](auto RW) -> MCDecodedPseudoProbe & { return RW.get(); }); while (ProbeTrack != 0) { if (Probe->isBlock()) { Probe->setAddress(BlkOutputAddress); @@ -218,9 +220,7 @@ void PseudoProbeRewriter::updatePseudoProbes() { } while (CallOutputAddress != CallOutputAddresses.second) { - AP.second.push_back(*Probe); - AP.second.back().setAddress(CallOutputAddress->second); - Probe->getInlineTreeNode()->addProbes(&(AP.second.back())); + ProbeDecoder.addInjectedProbe(*Probe, CallOutputAddress->second); CallOutputAddress = std::next(CallOutputAddress); } } @@ -332,7 +332,7 @@ void PseudoProbeRewriter::encodePseudoProbes() { ProbeDecoder.getDummyInlineRoot(); for (auto Child = Root.getChildren().begin(); Child != Root.getChildren().end(); ++Child) - Inlinees[Child->first] = Child->second.get(); + Inlinees[Child->getInlineSite()] = 
&*Child; for (auto Inlinee : Inlinees) // INT64_MAX is "placeholder" of unused callsite index field in the pair @@ -358,25 +358,37 @@ void PseudoProbeRewriter::encodePseudoProbes() { EmitInt(Cur->Guid, 8); // Emit number of probes in this node uint64_t Deleted = 0; - for (MCDecodedPseudoProbe *&Probe : Cur->getProbes()) + for (MCDecodedPseudoProbe *&Probe : + llvm::make_pointer_range(Cur->getProbes())) if (Probe->getAddress() == INT64_MAX) Deleted++; LLVM_DEBUG(dbgs() << "Deleted Probes:" << Deleted << "\n"); - uint64_t ProbesSize = Cur->getProbes().size() - Deleted; + size_t InjectedProbes = ProbeDecoder.getNumInjectedProbes(Cur); + uint64_t ProbesSize = Cur->getProbes().size() - Deleted + InjectedProbes; EmitULEB128IntValue(ProbesSize); // Emit number of direct inlinees EmitULEB128IntValue(Cur->getChildren().size()); // Emit probes in this group - for (MCDecodedPseudoProbe *&Probe : Cur->getProbes()) { + for (MCDecodedPseudoProbe *&Probe : + llvm::make_pointer_range(Cur->getProbes())) { if (Probe->getAddress() == INT64_MAX) continue; EmitDecodedPseudoProbe(Probe); LastProbe = Probe; } + if (InjectedProbes) { + for (MCDecodedPseudoProbe *&Probe : + llvm::make_pointer_range(ProbeDecoder.getInjectedProbes(Cur))) { + if (Probe->getAddress() == INT64_MAX) + continue; + EmitDecodedPseudoProbe(Probe); + LastProbe = Probe; + } + } for (auto Child = Cur->getChildren().begin(); Child != Cur->getChildren().end(); ++Child) - Inlinees[Child->first] = Child->second.get(); + Inlinees[Child->getInlineSite()] = &*Child; for (const auto &Inlinee : Inlinees) { assert(Cur->Guid != 0 && "non root tree node must have nonzero Guid"); NextNodes.push_back({std::get<1>(Inlinee.first), Inlinee.second}); diff --git a/llvm/include/llvm/MC/MCPseudoProbe.h b/llvm/include/llvm/MC/MCPseudoProbe.h index 3dd10c0717679..66ad9db4860d8 100644 --- a/llvm/include/llvm/MC/MCPseudoProbe.h +++ b/llvm/include/llvm/MC/MCPseudoProbe.h @@ -54,20 +54,21 @@ #ifndef LLVM_MC_MCPSEUDOPROBE_H #define 
LLVM_MC_MCPSEUDOPROBE_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/iterator.h" #include "llvm/IR/PseudoProbe.h" #include "llvm/Support/ErrorOr.h" -#include +#include #include #include #include #include #include #include -#include #include namespace llvm { @@ -103,14 +104,15 @@ using MCPseudoProbeInlineStack = SmallVector; using GUIDProbeFunctionMap = std::unordered_map; // Address to pseudo probes map. -using AddressProbesMap = std::map>; +using AddressProbesMap = + std::map>>; class MCDecodedPseudoProbeInlineTree; class MCPseudoProbeBase { protected: - uint64_t Guid; - uint64_t Index; + uint32_t Index; uint32_t Discriminator; uint8_t Attributes; uint8_t Type; @@ -120,14 +122,12 @@ class MCPseudoProbeBase { const static uint32_t PseudoProbeFirstId = 1; public: - MCPseudoProbeBase(uint64_t G, uint64_t I, uint64_t At, uint8_t T, uint32_t D) - : Guid(G), Index(I), Discriminator(D), Attributes(At), Type(T) {} + MCPseudoProbeBase(uint64_t I, uint64_t At, uint8_t T, uint32_t D) + : Index(I), Discriminator(D), Attributes(At), Type(T) {} bool isEntry() const { return Index == PseudoProbeFirstId; } - uint64_t getGuid() const { return Guid; } - - uint64_t getIndex() const { return Index; } + uint32_t getIndex() const { return Index; } uint32_t getDiscriminator() const { return Discriminator; } @@ -157,18 +157,20 @@ class MCPseudoProbeBase { /// uses an address from a temporary label created at the current address in the /// current section. 
class MCPseudoProbe : public MCPseudoProbeBase { + uint64_t Guid; MCSymbol *Label; public: MCPseudoProbe(MCSymbol *Label, uint64_t Guid, uint64_t Index, uint64_t Type, uint64_t Attributes, uint32_t Discriminator) - : MCPseudoProbeBase(Guid, Index, Attributes, Type, Discriminator), + : MCPseudoProbeBase(Index, Attributes, Type, Discriminator), Guid(Guid), Label(Label) { assert(Type <= 0xFF && "Probe type too big to encode, exceeding 2^8"); assert(Attributes <= 0xFF && "Probe attributes too big to encode, exceeding 2^16"); } + uint64_t getGuid() const { return Guid; }; MCSymbol *getLabel() const { return Label; } void emit(MCObjectStreamer *MCOS, const MCPseudoProbe *LastProbe) const; }; @@ -181,11 +183,11 @@ class MCDecodedPseudoProbe : public MCPseudoProbeBase { MCDecodedPseudoProbeInlineTree *InlineTree; public: - MCDecodedPseudoProbe(uint64_t Ad, uint64_t G, uint32_t I, PseudoProbeType K, - uint8_t At, uint32_t D, - MCDecodedPseudoProbeInlineTree *Tree) - : MCPseudoProbeBase(G, I, At, static_cast(K), D), Address(Ad), + MCDecodedPseudoProbe(uint64_t Ad, uint32_t I, PseudoProbeType K, uint8_t At, + uint32_t D, MCDecodedPseudoProbeInlineTree *Tree) + : MCPseudoProbeBase(I, At, static_cast(K), D), Address(Ad), InlineTree(Tree){}; + uint64_t getGuid() const; uint64_t getAddress() const { return Address; } @@ -211,21 +213,14 @@ class MCDecodedPseudoProbe : public MCPseudoProbeBase { bool ShowName) const; }; -template +template class MCPseudoProbeInlineTreeBase { - struct InlineSiteHash { - uint64_t operator()(const InlineSite &Site) const { - return std::get<0>(Site) ^ std::get<1>(Site); - } - }; - protected: // Track children (e.g. inlinees) of current context - using InlinedProbeTreeMap = std::unordered_map< - InlineSite, std::unique_ptr, InlineSiteHash>; InlinedProbeTreeMap Children; // Set of probes that come with the function. 
- std::vector Probes; + ProbesType Probes; MCPseudoProbeInlineTreeBase() { static_assert(std::is_base_of::value, @@ -240,12 +235,10 @@ class MCPseudoProbeInlineTreeBase { bool isRoot() const { return Guid == 0; } InlinedProbeTreeMap &getChildren() { return Children; } const InlinedProbeTreeMap &getChildren() const { return Children; } - std::vector &getProbes() { return Probes; } - const std::vector &getProbes() const { return Probes; } - void addProbes(ProbeType Probe) { Probes.push_back(Probe); } + const ProbesType &getProbes() const { return Probes; } // Caller node of the inline site - MCPseudoProbeInlineTreeBase *Parent = - nullptr; + MCPseudoProbeInlineTreeBase *Parent = nullptr; DerivedProbeInlineTreeType *getOrAddNode(const InlineSite &Site) { auto Ret = Children.emplace( Site, std::make_unique(Site)); @@ -259,9 +252,17 @@ class MCPseudoProbeInlineTreeBase { // instance is created as the root of a tree. // A real instance of this class is created for each function, either a // not inlined function that has code in .text section or an inlined function. 
+struct InlineSiteHash { + uint64_t operator()(const InlineSite &Site) const { + return std::get<0>(Site) ^ std::get<1>(Site); + } +}; class MCPseudoProbeInlineTree - : public MCPseudoProbeInlineTreeBase { + : public MCPseudoProbeInlineTreeBase< + std::vector, MCPseudoProbeInlineTree, + std::unordered_map, + InlineSiteHash>> { public: MCPseudoProbeInlineTree() = default; MCPseudoProbeInlineTree(uint64_t Guid) { this->Guid = Guid; } @@ -277,16 +278,31 @@ class MCPseudoProbeInlineTree // inline tree node for the decoded pseudo probe class MCDecodedPseudoProbeInlineTree - : public MCPseudoProbeInlineTreeBase { -public: - InlineSite ISite; + : public MCPseudoProbeInlineTreeBase< + MCDecodedPseudoProbe *, MCDecodedPseudoProbeInlineTree, + MutableArrayRef> { + uint32_t NumProbes = 0; + uint32_t ProbeId = 0; +public: MCDecodedPseudoProbeInlineTree() = default; - MCDecodedPseudoProbeInlineTree(const InlineSite &Site) : ISite(Site){}; + MCDecodedPseudoProbeInlineTree(const InlineSite &Site, + MCDecodedPseudoProbeInlineTree *Parent) + : ProbeId(std::get<1>(Site)) { + this->Guid = std::get<0>(Site); + this->Parent = Parent; + } // Return false if it's a dummy inline site bool hasInlineSite() const { return !isRoot() && !Parent->isRoot(); } + InlineSite getInlineSite() const { return InlineSite(Guid, ProbeId); } + void setProbes(MutableArrayRef ProbesRef) { + Probes = ProbesRef.data(); + NumProbes = ProbesRef.size(); + } + auto getProbes() const { + return MutableArrayRef(Probes, NumProbes); + } }; /// Instances of this class represent the pseudo probes inserted into a compile @@ -336,6 +352,20 @@ class MCPseudoProbeTable { }; class MCPseudoProbeDecoder { + // Decoded pseudo probes vector. + std::vector PseudoProbeVec; + // Injected pseudo probes, identified by the containing inline tree node. 
+ // Need to keep injected probes separately for two reasons: + // 1) Probes cannot be added to the PseudoProbeVec: appending may cause + // reallocation so that pointers to its elements will become invalid. + // 2) Probes belonging to function record must be contiguous in PseudoProbeVec + // as owning InlineTree references them with an ArrayRef to save space. + std::unordered_map> + InjectedProbeMap; + // Decoded inline records vector. + std::vector InlineTreeVec; + // GUID to PseudoProbeFuncDesc map. GUIDProbeFunctionMap GUID2FuncDescMap; @@ -382,10 +412,6 @@ class MCPseudoProbeDecoder { const Uint64Set &GuildFilter, const Uint64Map &FuncStartAddrs); - bool buildAddress2ProbeMap(MCDecodedPseudoProbeInlineTree *Cur, - uint64_t &LastAddr, const Uint64Set &GuildFilter, - const Uint64Map &FuncStartAddrs); - // Print pseudo_probe_desc section info void printGUID2FuncDescMap(raw_ostream &OS); @@ -428,6 +454,34 @@ class MCPseudoProbeDecoder { const MCDecodedPseudoProbeInlineTree &getDummyInlineRoot() const { return DummyInlineRoot; } + + void addInjectedProbe(const MCDecodedPseudoProbe &Probe, uint64_t Address) { + const MCDecodedPseudoProbeInlineTree *Parent = Probe.getInlineTreeNode(); + InjectedProbeMap[Parent].emplace_back(Probe).setAddress(Address); + } + + size_t + getNumInjectedProbes(const MCDecodedPseudoProbeInlineTree *Parent) const { + auto It = InjectedProbeMap.find(Parent); + if (It == InjectedProbeMap.end()) + return 0; + return It->second.size(); + } + + auto getInjectedProbes(MCDecodedPseudoProbeInlineTree *Parent) { + auto It = InjectedProbeMap.find(Parent); + assert(It != InjectedProbeMap.end()); + return iterator_range(It->second); + } + +private: + // Recursively parse an inlining tree encoded in pseudo_probe section. Returns + // whether the top-level node should be skipped.
+ template + bool buildAddress2ProbeMap(MCDecodedPseudoProbeInlineTree *Cur, + uint64_t &LastAddr, const Uint64Set &GuildFilter, + const Uint64Map &FuncStartAddrs, + const uint32_t CurChildIndex); }; } // end namespace llvm diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp index 21c6c4f766c30..3dc443a503f72 100644 --- a/llvm/lib/MC/MCPseudoProbe.cpp +++ b/llvm/lib/MC/MCPseudoProbe.cpp @@ -49,6 +49,8 @@ static const MCExpr *buildSymbolDiff(MCObjectStreamer *MCOS, const MCSymbol *A, return AddrDelta; } +uint64_t MCDecodedPseudoProbe::getGuid() const { return InlineTree->Guid; } + void MCPseudoProbe::emit(MCObjectStreamer *MCOS, const MCPseudoProbe *LastProbe) const { bool IsSentinel = isSentinelProbe(getAttributes()); @@ -289,8 +291,8 @@ void MCDecodedPseudoProbe::getInlineContext( // Note that it won't include the probe's belonging function(leaf location) while (Cur->hasInlineSite()) { StringRef FuncName = getProbeFNameForGUID(GUID2FuncMAP, Cur->Parent->Guid); - ContextStack.emplace_back( - MCPseudoProbeFrameLocation(FuncName, std::get<1>(Cur->ISite))); + ContextStack.emplace_back(MCPseudoProbeFrameLocation( + FuncName, std::get<1>(Cur->getInlineSite()))); Cur = static_cast(Cur->Parent); } // Make the ContextStack in caller-callee order @@ -318,10 +320,10 @@ void MCDecodedPseudoProbe::print(raw_ostream &OS, bool ShowName) const { OS << "FUNC: "; if (ShowName) { - StringRef FuncName = getProbeFNameForGUID(GUID2FuncMAP, Guid); + StringRef FuncName = getProbeFNameForGUID(GUID2FuncMAP, getGuid()); OS << FuncName.str() << " "; } else { - OS << Guid << " "; + OS << getGuid() << " "; } OS << "Index: " << Index << " "; if (Discriminator) @@ -417,17 +419,18 @@ bool MCPseudoProbeDecoder::buildGUID2FuncDescMap(const uint8_t *Start, return true; } +template bool MCPseudoProbeDecoder::buildAddress2ProbeMap( MCDecodedPseudoProbeInlineTree *Cur, uint64_t &LastAddr, - const Uint64Set &GuidFilter, const Uint64Map &FuncStartAddrs) { + const Uint64Set 
&GuidFilter, const Uint64Map &FuncStartAddrs, + const uint32_t CurChildIndex) { // The pseudo_probe section encodes an inline forest and each tree has a // format defined in MCPseudoProbe.h uint32_t Index = 0; - bool IsTopLevelFunc = Cur == &DummyInlineRoot; if (IsTopLevelFunc) { // Use a sequential id for top level inliner. - Index = Cur->getChildren().size(); + Index = CurChildIndex; } else { // Read inline site for inlinees Index = cantFail(errorOrToExpected(readUnsignedNumber())); @@ -443,8 +446,9 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap( // If the incoming node is null, all its children nodes should be disgarded. if (Cur) { // Switch/add to a new tree node(inlinee) - Cur = Cur->getOrAddNode(std::make_tuple(Guid, Index)); - Cur->Guid = Guid; + Cur->getChildren()[CurChildIndex] = + MCDecodedPseudoProbeInlineTree(InlineSite(Guid, Index), Cur); + Cur = &Cur->getChildren()[CurChildIndex]; if (IsTopLevelFunc && !EncodingIsAddrBased) { if (auto V = FuncStartAddrs.lookup(Guid)) LastAddr = V; @@ -454,6 +458,7 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap( // Read number of probes in the current node. 
uint32_t NodeCount = cantFail(errorOrToExpected(readUnsignedNumber())); + uint32_t CurrentProbeCount = 0; // Read number of direct inlinees uint32_t ChildrenToProcess = cantFail(errorOrToExpected(readUnsignedNumber())); @@ -494,19 +499,25 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap( } if (Cur && !isSentinelProbe(Attr)) { - // Populate Address2ProbesMap - auto &Probes = Address2ProbesMap[Addr]; - Probes.emplace_back(Addr, Cur->Guid, Index, PseudoProbeType(Kind), Attr, - Discriminator, Cur); - Cur->addProbes(&Probes.back()); + PseudoProbeVec.emplace_back(Addr, Index, PseudoProbeType(Kind), Attr, + Discriminator, Cur); + Address2ProbesMap[Addr].emplace_back(PseudoProbeVec.back()); + ++CurrentProbeCount; } LastAddr = Addr; } + if (Cur) { + Cur->setProbes( + MutableArrayRef(PseudoProbeVec).take_back(CurrentProbeCount)); + InlineTreeVec.resize(InlineTreeVec.size() + ChildrenToProcess); + Cur->getChildren() = + MutableArrayRef(InlineTreeVec).take_back(ChildrenToProcess); + } for (uint32_t I = 0; I < ChildrenToProcess; I++) { - buildAddress2ProbeMap(Cur, LastAddr, GuidFilter, FuncStartAddrs); + buildAddress2ProbeMap(Cur, LastAddr, GuidFilter, FuncStartAddrs, I); } - return true; + return Cur; } template @@ -605,14 +616,25 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap( TopLevelFuncs += !Discard; } assert(Data == End && "Have unprocessed data in pseudo_probe section"); + PseudoProbeVec.reserve(ProbeCount); + InlineTreeVec.reserve(InlinedCount); + + // Allocate top-level function records as children of DummyInlineRoot. 
+ InlineTreeVec.resize(TopLevelFuncs); + DummyInlineRoot.getChildren() = MutableArrayRef(InlineTreeVec); Data = Start; End = Data + Size; uint64_t LastAddr = 0; + uint32_t CurChildIndex = 0; while (Data < End) - buildAddress2ProbeMap(&DummyInlineRoot, LastAddr, GuidFilter, - FuncStartAddrs); + CurChildIndex += buildAddress2ProbeMap( + &DummyInlineRoot, LastAddr, GuidFilter, FuncStartAddrs, CurChildIndex); assert(Data == End && "Have unprocessed data in pseudo_probe section"); + assert(PseudoProbeVec.size() == ProbeCount && + "Mismatching probe count pre- and post-parsing"); + assert(InlineTreeVec.size() == InlinedCount && + "Mismatching function records count pre- and post-parsing"); return true; } diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp index 175556c2220e6..2c6875281047d 100644 --- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp +++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp @@ -1302,9 +1302,9 @@ void CSProfileGenerator::populateBodySamplesWithProbes( // and will be inferred by the compiler. 
for (auto &I : FrameSamples) { for (auto *FunctionProfile : I.second) { - for (auto *Probe : I.first->getProbes()) { - FunctionProfile->addBodySamples(Probe->getIndex(), - Probe->getDiscriminator(), 0); + for (const MCDecodedPseudoProbe &Probe : I.first->getProbes()) { + FunctionProfile->addBodySamples(Probe.getIndex(), + Probe.getDiscriminator(), 0); } } } diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp index 574a9c9f52bf1..fe7d3ffa476eb 100644 --- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp +++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp @@ -132,7 +132,7 @@ void BinarySizeContextTracker::trackInlineesOptimizedAway( MCPseudoProbeDecoder &ProbeDecoder) { ProbeFrameStack ProbeContext; for (const auto &Child : ProbeDecoder.getDummyInlineRoot().getChildren()) - trackInlineesOptimizedAway(ProbeDecoder, *Child.second, ProbeContext); + trackInlineesOptimizedAway(ProbeDecoder, Child, ProbeContext); } void BinarySizeContextTracker::trackInlineesOptimizedAway( @@ -160,9 +160,9 @@ void BinarySizeContextTracker::trackInlineesOptimizedAway( // DFS down the probe inline tree for (const auto &ChildNode : ProbeNode.getChildren()) { - InlineSite Location = ChildNode.first; + InlineSite Location = ChildNode.getInlineSite(); ProbeContext.back().second = std::get<1>(Location); - trackInlineesOptimizedAway(ProbeDecoder, *ChildNode.second, ProbeContext); + trackInlineesOptimizedAway(ProbeDecoder, ChildNode, ProbeContext); } ProbeContext.pop_back(); @@ -454,8 +454,8 @@ void ProfiledBinary::decodePseudoProbe(const ELFObjectFileBase *Obj) { // Build TopLevelProbeFrameMap to track size for optimized inlinees when probe // is available if (TrackFuncContextSize) { - for (const auto &Child : ProbeDecoder.getDummyInlineRoot().getChildren()) { - auto *Frame = Child.second.get(); + for (auto &Child : ProbeDecoder.getDummyInlineRoot().getChildren()) { + auto *Frame = &Child; StringRef FuncName = 
ProbeDecoder.getFuncDescForGUID(Frame->Guid)->FuncName; TopLevelProbeFrameMap[FuncName] = Frame; From 9746055b0a1ae1e7c6aff50fc217dc216605c277 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Sat, 10 Aug 2024 23:48:40 -0700 Subject: [PATCH 32/42] [MC][profgen][NFC] Expand auto for MCDecodedPseudoProbe Expand autos in select places in preparation to #102789. Reviewers: dcci, maksfb, WenleiHe, rafaelauler, ayermolo, wlei-llvm Reviewed By: WenleiHe, wlei-llvm Pull Request: https://github.com/llvm/llvm-project/pull/102788 --- llvm/lib/MC/MCPseudoProbe.cpp | 4 ++-- llvm/tools/llvm-profgen/ProfileGenerator.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp index 3dc443a503f72..1031dac331bb1 100644 --- a/llvm/lib/MC/MCPseudoProbe.cpp +++ b/llvm/lib/MC/MCPseudoProbe.cpp @@ -652,7 +652,7 @@ void MCPseudoProbeDecoder::printProbeForAddress(raw_ostream &OS, uint64_t Address) { auto It = Address2ProbesMap.find(Address); if (It != Address2ProbesMap.end()) { - for (auto &Probe : It->second) { + for (const MCDecodedPseudoProbe &Probe : It->second) { OS << " [Probe]:\t"; Probe.print(OS, GUID2FuncDescMap, true); } @@ -679,7 +679,7 @@ MCPseudoProbeDecoder::getCallProbeForAddr(uint64_t Address) const { const auto &Probes = It->second; const MCDecodedPseudoProbe *CallProbe = nullptr; - for (const auto &Probe : Probes) { + for (const MCDecodedPseudoProbe &Probe : Probes) { if (Probe.isCall()) { // Disabling the assert and returning first call probe seen so far. // Subsequent call probes, if any, are ignored. 
Due to the the way diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp index 2c6875281047d..d9283271b03c0 100644 --- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp +++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp @@ -1194,7 +1194,7 @@ void ProfileGeneratorBase::extractProbesFromRange( Binary->getAddress2ProbesMap(); auto It = Address2ProbesMap.find(IP.Address); if (It != Address2ProbesMap.end()) { - for (const auto &Probe : It->second) { + for (const MCDecodedPseudoProbe &Probe : It->second) { ProbeCounter[&Probe] += Count; } } From 3dcef4813afc966aa7bb73d733556c369d3a8011 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Mon, 26 Aug 2024 09:14:35 -0700 Subject: [PATCH 33/42] [MC][NFC] Reduce Address2ProbesMap size Replace the map from addresses to list of probes with a flat vector containing probe references sorted by their addresses. Reduces pseudo probe parsing time from 9.56s to 8.59s and peak RSS from 9.66 GiB to 9.08 GiB as part of perf2bolt processing a large binary. 
Test Plan: ``` bin/llvm-lit -sv test/tools/llvm-profgen ``` Reviewers: maksfb, rafaelauler, dcci, ayermolo, wlei-llvm Reviewed By: wlei-llvm Pull Request: https://github.com/llvm/llvm-project/pull/102904 --- bolt/lib/Profile/DataAggregator.cpp | 14 ++-- bolt/lib/Profile/YAMLProfileWriter.cpp | 11 +-- bolt/lib/Rewrite/PseudoProbeRewriter.cpp | 83 ++++++++------------ llvm/include/llvm/MC/MCPseudoProbe.h | 30 +++++-- llvm/lib/MC/MCPseudoProbe.cpp | 43 +++++----- llvm/tools/llvm-profgen/ProfileGenerator.cpp | 8 +- 6 files changed, 94 insertions(+), 95 deletions(-) diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index a300e5b2b1dab..813d825f8b570 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -2415,17 +2415,15 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC, Fragments.insert(BF); for (const BinaryFunction *F : Fragments) { const uint64_t FuncAddr = F->getAddress(); - const auto &FragmentProbes = - llvm::make_range(ProbeMap.lower_bound(FuncAddr), - ProbeMap.lower_bound(FuncAddr + F->getSize())); - for (const auto &[OutputAddress, Probes] : FragmentProbes) { + for (const MCDecodedPseudoProbe &Probe : + ProbeMap.find(FuncAddr, FuncAddr + F->getSize())) { + const uint32_t OutputAddress = Probe.getAddress(); const uint32_t InputOffset = BAT->translate( FuncAddr, OutputAddress - FuncAddr, /*IsBranchSrc=*/true); const unsigned BlockIndex = getBlock(InputOffset).second; - for (const MCDecodedPseudoProbe &Probe : Probes) - YamlBF.Blocks[BlockIndex].PseudoProbes.emplace_back( - yaml::bolt::PseudoProbeInfo{Probe.getGuid(), Probe.getIndex(), - Probe.getType()}); + YamlBF.Blocks[BlockIndex].PseudoProbes.emplace_back( + yaml::bolt::PseudoProbeInfo{Probe.getGuid(), Probe.getIndex(), + Probe.getType()}); } } } diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp b/bolt/lib/Profile/YAMLProfileWriter.cpp index 84777741d611a..f74cf60e076d0 100644 --- 
a/bolt/lib/Profile/YAMLProfileWriter.cpp +++ b/bolt/lib/Profile/YAMLProfileWriter.cpp @@ -193,13 +193,10 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS, const uint64_t FuncAddr = BF.getAddress(); const std::pair &BlockRange = BB->getInputAddressRange(); - const auto &BlockProbes = - llvm::make_range(ProbeMap.lower_bound(FuncAddr + BlockRange.first), - ProbeMap.lower_bound(FuncAddr + BlockRange.second)); - for (const auto &[_, Probes] : BlockProbes) - for (const MCDecodedPseudoProbe &Probe : Probes) - YamlBB.PseudoProbes.emplace_back(yaml::bolt::PseudoProbeInfo{ - Probe.getGuid(), Probe.getIndex(), Probe.getType()}); + for (const MCDecodedPseudoProbe &Probe : ProbeMap.find( + FuncAddr + BlockRange.first, FuncAddr + BlockRange.second)) + YamlBB.PseudoProbes.emplace_back(yaml::bolt::PseudoProbeInfo{ + Probe.getGuid(), Probe.getIndex(), Probe.getType()}); } YamlBF.Blocks.emplace_back(YamlBB); diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp index 9677530919b90..7516918b2389f 100644 --- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp +++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp @@ -173,13 +173,13 @@ void PseudoProbeRewriter::updatePseudoProbes() { AddressProbesMap &Address2ProbesMap = ProbeDecoder.getAddress2ProbesMap(); const GUIDProbeFunctionMap &GUID2Func = ProbeDecoder.getGUID2FuncDescMap(); - for (auto &AP : Address2ProbesMap) { - BinaryFunction *F = BC.getBinaryFunctionContainingAddress(AP.first); + for (MCDecodedPseudoProbe &Probe : Address2ProbesMap) { + uint64_t Address = Probe.getAddress(); + BinaryFunction *F = BC.getBinaryFunctionContainingAddress(Address); // If F is removed, eliminate all probes inside it from inline tree // Setting probes' addresses as INT64_MAX means elimination if (!F) { - for (MCDecodedPseudoProbe &Probe : AP.second) - Probe.setAddress(INT64_MAX); + Probe.setAddress(INT64_MAX); continue; } // If F is not emitted, the function will remain in the same address as its @@ 
-187,45 +187,36 @@ void PseudoProbeRewriter::updatePseudoProbes() { if (!F->isEmitted()) continue; - uint64_t Offset = AP.first - F->getAddress(); + uint64_t Offset = Address - F->getAddress(); const BinaryBasicBlock *BB = F->getBasicBlockContainingOffset(Offset); uint64_t BlkOutputAddress = BB->getOutputAddressRange().first; // Check if block output address is defined. // If not, such block is removed from binary. Then remove the probes from // inline tree if (BlkOutputAddress == 0) { - for (MCDecodedPseudoProbe &Probe : AP.second) - Probe.setAddress(INT64_MAX); + Probe.setAddress(INT64_MAX); continue; } - unsigned ProbeTrack = AP.second.size(); - auto Probe = llvm::map_iterator( - AP.second.begin(), - [](auto RW) -> MCDecodedPseudoProbe & { return RW.get(); }); - while (ProbeTrack != 0) { - if (Probe->isBlock()) { - Probe->setAddress(BlkOutputAddress); - } else if (Probe->isCall()) { - // A call probe may be duplicated due to ICP - // Go through output of InputOffsetToAddressMap to collect all related - // probes - auto CallOutputAddresses = BC.getIOAddressMap().lookupAll(AP.first); - auto CallOutputAddress = CallOutputAddresses.first; - if (CallOutputAddress == CallOutputAddresses.second) { - Probe->setAddress(INT64_MAX); - } else { - Probe->setAddress(CallOutputAddress->second); - CallOutputAddress = std::next(CallOutputAddress); - } - - while (CallOutputAddress != CallOutputAddresses.second) { - ProbeDecoder.addInjectedProbe(*Probe, CallOutputAddress->second); - CallOutputAddress = std::next(CallOutputAddress); - } + if (Probe.isBlock()) { + Probe.setAddress(BlkOutputAddress); + } else if (Probe.isCall()) { + // A call probe may be duplicated due to ICP + // Go through output of InputOffsetToAddressMap to collect all related + // probes + auto CallOutputAddresses = BC.getIOAddressMap().lookupAll(Address); + auto CallOutputAddress = CallOutputAddresses.first; + if (CallOutputAddress == CallOutputAddresses.second) { + Probe.setAddress(INT64_MAX); + } else { + 
Probe.setAddress(CallOutputAddress->second); + CallOutputAddress = std::next(CallOutputAddress); + } + + while (CallOutputAddress != CallOutputAddresses.second) { + ProbeDecoder.addInjectedProbe(Probe, CallOutputAddress->second); + CallOutputAddress = std::next(CallOutputAddress); } - Probe = std::next(Probe); - ProbeTrack--; } } @@ -241,22 +232,16 @@ void PseudoProbeRewriter::updatePseudoProbes() { BinaryBlock.getName(); // scan all addresses -> correlate probe to block when print out - std::vector Addresses; - for (auto &Entry : Address2ProbesMap) - Addresses.push_back(Entry.first); - llvm::sort(Addresses); - for (uint64_t Key : Addresses) { - for (MCDecodedPseudoProbe &Probe : Address2ProbesMap[Key]) { - if (Probe.getAddress() == INT64_MAX) - outs() << "Deleted Probe: "; - else - outs() << "Address: " << format_hex(Probe.getAddress(), 8) << " "; - Probe.print(outs(), GUID2Func, true); - // print block name only if the probe is block type and undeleted. - if (Probe.isBlock() && Probe.getAddress() != INT64_MAX) - outs() << format_hex(Probe.getAddress(), 8) << " Probe is in " - << Addr2BlockNames[Probe.getAddress()] << "\n"; - } + for (MCDecodedPseudoProbe &Probe : Address2ProbesMap) { + if (Probe.getAddress() == INT64_MAX) + outs() << "Deleted Probe: "; + else + outs() << "Address: " << format_hex(Probe.getAddress(), 8) << " "; + Probe.print(outs(), GUID2Func, true); + // print block name only if the probe is block type and undeleted. 
+ if (Probe.isBlock() && Probe.getAddress() != INT64_MAX) + outs() << format_hex(Probe.getAddress(), 8) << " Probe is in " + << Addr2BlockNames[Probe.getAddress()] << "\n"; } outs() << "=======================================\n"; } diff --git a/llvm/include/llvm/MC/MCPseudoProbe.h b/llvm/include/llvm/MC/MCPseudoProbe.h index 66ad9db4860d8..854f1209c3934 100644 --- a/llvm/include/llvm/MC/MCPseudoProbe.h +++ b/llvm/include/llvm/MC/MCPseudoProbe.h @@ -63,7 +63,6 @@ #include "llvm/IR/PseudoProbe.h" #include "llvm/Support/ErrorOr.h" #include -#include #include #include #include @@ -103,10 +102,6 @@ using MCPseudoProbeInlineStack = SmallVector; // GUID to PseudoProbeFuncDesc map using GUIDProbeFunctionMap = std::unordered_map; -// Address to pseudo probes map. -using AddressProbesMap = - std::map>>; class MCDecodedPseudoProbeInlineTree; @@ -213,6 +208,31 @@ class MCDecodedPseudoProbe : public MCPseudoProbeBase { bool ShowName) const; }; +// Address to pseudo probes map. +class AddressProbesMap + : public std::vector> { + auto getIt(uint64_t Addr) const { + auto CompareProbe = [](const MCDecodedPseudoProbe &Probe, uint64_t Addr) { + return Probe.getAddress() < Addr; + }; + return llvm::lower_bound(*this, Addr, CompareProbe); + } + +public: + // Returns range of probes within [\p From, \p To) address range. + auto find(uint64_t From, uint64_t To) const { + return llvm::make_range(getIt(From), getIt(To)); + } + // Returns range of probes with given \p Address. 
+ auto find(uint64_t Address) const { + auto FromIt = getIt(Address); + if (FromIt == end() || FromIt->get().getAddress() != Address) + return llvm::make_range(end(), end()); + auto ToIt = getIt(Address + 1); + return llvm::make_range(FromIt, ToIt); + } +}; + template class MCPseudoProbeInlineTreeBase { diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp index 1031dac331bb1..5951499c0cb28 100644 --- a/llvm/lib/MC/MCPseudoProbe.cpp +++ b/llvm/lib/MC/MCPseudoProbe.cpp @@ -501,7 +501,6 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap( if (Cur && !isSentinelProbe(Attr)) { PseudoProbeVec.emplace_back(Addr, Index, PseudoProbeType(Kind), Attr, Discriminator, Cur); - Address2ProbesMap[Addr].emplace_back(PseudoProbeVec.back()); ++CurrentProbeCount; } LastAddr = Addr; @@ -635,6 +634,15 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap( "Mismatching probe count pre- and post-parsing"); assert(InlineTreeVec.size() == InlinedCount && "Mismatching function records count pre- and post-parsing"); + + std::vector> SortedA2P(ProbeCount); + for (const auto &[I, Probe] : llvm::enumerate(PseudoProbeVec)) + SortedA2P[I] = {Probe.getAddress(), I}; + llvm::sort(SortedA2P); + Address2ProbesMap.reserve(ProbeCount); + for (const uint32_t I : llvm::make_second_range(SortedA2P)) + Address2ProbesMap.emplace_back(PseudoProbeVec[I]); + SortedA2P.clear(); return true; } @@ -650,36 +658,29 @@ void MCPseudoProbeDecoder::printGUID2FuncDescMap(raw_ostream &OS) { void MCPseudoProbeDecoder::printProbeForAddress(raw_ostream &OS, uint64_t Address) { - auto It = Address2ProbesMap.find(Address); - if (It != Address2ProbesMap.end()) { - for (const MCDecodedPseudoProbe &Probe : It->second) { - OS << " [Probe]:\t"; - Probe.print(OS, GUID2FuncDescMap, true); - } + for (const MCDecodedPseudoProbe &Probe : Address2ProbesMap.find(Address)) { + OS << " [Probe]:\t"; + Probe.print(OS, GUID2FuncDescMap, true); } } void MCPseudoProbeDecoder::printProbesForAllAddresses(raw_ostream &OS) { - 
auto Entries = make_first_range(Address2ProbesMap); - SmallVector Addresses(Entries.begin(), Entries.end()); - llvm::sort(Addresses); - for (auto K : Addresses) { - OS << "Address:\t"; - OS << K; - OS << "\n"; - printProbeForAddress(OS, K); + uint64_t PrevAddress = INT64_MAX; + for (MCDecodedPseudoProbe &Probe : Address2ProbesMap) { + uint64_t Address = Probe.getAddress(); + if (Address != PrevAddress) { + PrevAddress = Address; + OS << "Address:\t" << Address << '\n'; + } + OS << " [Probe]:\t"; + Probe.print(OS, GUID2FuncDescMap, true); } } const MCDecodedPseudoProbe * MCPseudoProbeDecoder::getCallProbeForAddr(uint64_t Address) const { - auto It = Address2ProbesMap.find(Address); - if (It == Address2ProbesMap.end()) - return nullptr; - const auto &Probes = It->second; - const MCDecodedPseudoProbe *CallProbe = nullptr; - for (const MCDecodedPseudoProbe &Probe : Probes) { + for (const MCDecodedPseudoProbe &Probe : Address2ProbesMap.find(Address)) { if (Probe.isCall()) { // Disabling the assert and returning first call probe seen so far. // Subsequent call probes, if any, are ignored. 
Due to the way diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp index d9283271b03c0..3b12f536be55d 100644 --- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp +++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp @@ -1192,11 +1192,9 @@ void ProfileGeneratorBase::extractProbesFromRange( do { const AddressProbesMap &Address2ProbesMap = Binary->getAddress2ProbesMap(); - auto It = Address2ProbesMap.find(IP.Address); - if (It != Address2ProbesMap.end()) { - for (const MCDecodedPseudoProbe &Probe : It->second) { - ProbeCounter[&Probe] += Count; - } + for (const MCDecodedPseudoProbe &Probe : + Address2ProbesMap.find(IP.Address)) { + ProbeCounter[&Probe] += Count; } } while (IP.advance() && IP.Address <= RangeEnd); } From ba149d99c8dc1d813226b660f3d14b5d879a721c Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Mon, 26 Aug 2024 09:15:53 -0700 Subject: [PATCH 34/42] [MC][NFC] Use vector for GUIDProbeFunctionMap Replace unordered_map with a vector. Pre-parse the section to statically allocate storage. Use BumpPtrAllocator for FuncName strings, keep StringRef in FuncDesc. Reduces peak RSS of pseudo probe parsing from 9.08 GiB to 8.89 GiB as part of perf2bolt with a large binary.
Test Plan: ``` bin/llvm-lit -sv test/tools/llvm-profgen ``` Reviewers: wlei-llvm, rafaelauler, dcci, maksfb, ayermolo Reviewed By: wlei-llvm Pull Request: https://github.com/llvm/llvm-project/pull/102905 --- bolt/lib/Rewrite/PseudoProbeRewriter.cpp | 3 +- llvm/include/llvm/MC/MCPseudoProbe.h | 19 +++++++-- llvm/lib/MC/MCPseudoProbe.cpp | 52 ++++++++++++++---------- 3 files changed, 49 insertions(+), 25 deletions(-) diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp index 7516918b2389f..4925b4b385d9b 100644 --- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp +++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp @@ -155,7 +155,8 @@ void PseudoProbeRewriter::parsePseudoProbe() { ProbeDecoder.printProbesForAllAddresses(outs()); } - for (const auto &[GUID, FuncDesc] : ProbeDecoder.getGUID2FuncDescMap()) { + for (const auto &FuncDesc : ProbeDecoder.getGUID2FuncDescMap()) { + uint64_t GUID = FuncDesc.FuncGUID; if (!FuncStartAddrs.contains(GUID)) continue; BinaryFunction *BF = BC.getBinaryFunctionAtAddress(FuncStartAddrs[GUID]); diff --git a/llvm/include/llvm/MC/MCPseudoProbe.h b/llvm/include/llvm/MC/MCPseudoProbe.h index 854f1209c3934..32905c1e9a424 100644 --- a/llvm/include/llvm/MC/MCPseudoProbe.h +++ b/llvm/include/llvm/MC/MCPseudoProbe.h @@ -61,6 +61,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator.h" #include "llvm/IR/PseudoProbe.h" +#include "llvm/Support/Allocator.h" #include "llvm/Support/ErrorOr.h" #include #include @@ -86,7 +87,7 @@ enum class MCPseudoProbeFlag { struct MCPseudoProbeFuncDesc { uint64_t FuncGUID = 0; uint64_t FuncHash = 0; - std::string FuncName; + StringRef FuncName; MCPseudoProbeFuncDesc(uint64_t GUID, uint64_t Hash, StringRef Name) : FuncGUID(GUID), FuncHash(Hash), FuncName(Name){}; @@ -100,8 +101,18 @@ class MCDecodedPseudoProbe; using InlineSite = std::tuple; using MCPseudoProbeInlineStack = SmallVector; // GUID to PseudoProbeFuncDesc map -using GUIDProbeFunctionMap = - std::unordered_map; 
+class GUIDProbeFunctionMap : public std::vector { +public: + auto find(uint64_t GUID) const { + auto CompareDesc = [](const MCPseudoProbeFuncDesc &Desc, uint64_t GUID) { + return Desc.FuncGUID < GUID; + }; + auto It = llvm::lower_bound(*this, GUID, CompareDesc); + if (It->FuncGUID != GUID) + return end(); + return It; + } +}; class MCDecodedPseudoProbeInlineTree; @@ -389,6 +400,8 @@ class MCPseudoProbeDecoder { // GUID to PseudoProbeFuncDesc map. GUIDProbeFunctionMap GUID2FuncDescMap; + BumpPtrAllocator FuncNameAllocator; + // Address to probes map. AddressProbesMap Address2ProbesMap; diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp index 5951499c0cb28..90d7588407068 100644 --- a/llvm/lib/MC/MCPseudoProbe.cpp +++ b/llvm/lib/MC/MCPseudoProbe.cpp @@ -274,7 +274,7 @@ static StringRef getProbeFNameForGUID(const GUIDProbeFunctionMap &GUID2FuncMAP, auto It = GUID2FuncMAP.find(GUID); assert(It != GUID2FuncMAP.end() && "Probe function must exist for a valid GUID"); - return It->second.FuncName; + return It->FuncName; } void MCPseudoProbeFuncDesc::print(raw_ostream &OS) { @@ -390,32 +390,46 @@ bool MCPseudoProbeDecoder::buildGUID2FuncDescMap(const uint8_t *Start, Data = Start; End = Data + Size; + uint32_t FuncDescCount = 0; while (Data < End) { - auto ErrorOrGUID = readUnencodedNumber(); - if (!ErrorOrGUID) + // GUID + if (!readUnencodedNumber()) return false; - - auto ErrorOrHash = readUnencodedNumber(); - if (!ErrorOrHash) + // Hash + if (!readUnencodedNumber()) return false; auto ErrorOrNameSize = readUnsignedNumber(); if (!ErrorOrNameSize) return false; - uint32_t NameSize = std::move(*ErrorOrNameSize); - - auto ErrorOrName = readString(NameSize); - if (!ErrorOrName) + // Function name + if (!readString(*ErrorOrNameSize)) return false; + ++FuncDescCount; + } + assert(Data == End && "Have unprocessed data in pseudo_probe_desc section"); + GUID2FuncDescMap.reserve(FuncDescCount); - uint64_t GUID = std::move(*ErrorOrGUID); - uint64_t Hash = 
std::move(*ErrorOrHash); - StringRef Name = std::move(*ErrorOrName); + Data = Start; + End = Data + Size; + while (Data < End) { + uint64_t GUID = + cantFail(errorOrToExpected(readUnencodedNumber())); + uint64_t Hash = + cantFail(errorOrToExpected(readUnencodedNumber())); + uint32_t NameSize = + cantFail(errorOrToExpected(readUnsignedNumber())); + StringRef Name = cantFail(errorOrToExpected(readString(NameSize))); // Initialize PseudoProbeFuncDesc and populate it into GUID2FuncDescMap - GUID2FuncDescMap.emplace(GUID, MCPseudoProbeFuncDesc(GUID, Hash, Name)); + GUID2FuncDescMap.emplace_back(GUID, Hash, Name.copy(FuncNameAllocator)); } assert(Data == End && "Have unprocessed data in pseudo_probe_desc section"); + assert(GUID2FuncDescMap.size() == FuncDescCount && + "Mismatching function description count pre- and post-parsing"); + llvm::sort(GUID2FuncDescMap, [](const auto &LHS, const auto &RHS) { + return LHS.FuncGUID < RHS.FuncGUID; + }); return true; } @@ -648,12 +662,8 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap( void MCPseudoProbeDecoder::printGUID2FuncDescMap(raw_ostream &OS) { OS << "Pseudo Probe Desc:\n"; - // Make the output deterministic - std::map OrderedMap(GUID2FuncDescMap.begin(), - GUID2FuncDescMap.end()); - for (auto &I : OrderedMap) { - I.second.print(OS); - } + for (auto &I : GUID2FuncDescMap) + I.print(OS); } void MCPseudoProbeDecoder::printProbeForAddress(raw_ostream &OS, @@ -705,7 +715,7 @@ const MCPseudoProbeFuncDesc * MCPseudoProbeDecoder::getFuncDescForGUID(uint64_t GUID) const { auto It = GUID2FuncDescMap.find(GUID); assert(It != GUID2FuncDescMap.end() && "Function descriptor doesn't exist"); - return &It->second; + return &*It; } void MCPseudoProbeDecoder::getInlineContextForProbe( From c35e8acd11e67fb9a5cd0a66ae51066f24df524a Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Tue, 30 Jul 2024 11:24:24 -0700 Subject: [PATCH 35/42] buildAddress2ProbeMap timers --- llvm/lib/MC/MCPseudoProbe.cpp | 50 +++++++++++++++++++++++++++++++++++ 1 
file changed, 50 insertions(+) diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp index 90d7588407068..af7fe7edff1e7 100644 --- a/llvm/lib/MC/MCPseudoProbe.cpp +++ b/llvm/lib/MC/MCPseudoProbe.cpp @@ -19,8 +19,10 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/LEB128.h" #include "llvm/Support/MD5.h" +#include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -376,6 +378,8 @@ ErrorOr MCPseudoProbeDecoder::readString(uint32_t Size) { bool MCPseudoProbeDecoder::buildGUID2FuncDescMap(const uint8_t *Start, std::size_t Size) { + Timer T("buildGUID2FDMap", "build GUID to FuncDesc map"); + T.startTimer(); // The pseudo_probe_desc section has a format like: // .section .pseudo_probe_desc,"",@progbits // .quad -5182264717993193164 // GUID @@ -430,6 +434,12 @@ bool MCPseudoProbeDecoder::buildGUID2FuncDescMap(const uint8_t *Start, llvm::sort(GUID2FuncDescMap, [](const auto &LHS, const auto &RHS) { return LHS.FuncGUID < RHS.FuncGUID; }); + T.stopTimer(); + auto TT = T.getTotalTime(); + T.clear(); + dbgs() << "func desc "; + TT.print(TT, dbgs()); + dbgs() << '\n'; return true; } @@ -623,12 +633,20 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap( Data = Start; End = Data + Size; bool Discard = false; + Timer T("countRecords", "pre-parsing function records"); + T.startTimer(); while (Data < End) { if (!countRecords(Discard, ProbeCount, InlinedCount, GuidFilter)) return false; TopLevelFuncs += !Discard; } + T.stopTimer(); + auto TT = T.getTotalTime(); + T.clear(); + dbgs() << "pre-parsing "; + TT.print(TT, dbgs()); assert(Data == End && "Have unprocessed data in pseudo_probe section"); + T.startTimer(); PseudoProbeVec.reserve(ProbeCount); InlineTreeVec.reserve(InlinedCount); @@ -636,6 +654,13 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap( InlineTreeVec.resize(TopLevelFuncs); DummyInlineRoot.getChildren() 
= MutableArrayRef(InlineTreeVec); + T.stopTimer(); + TT = T.getTotalTime(); + T.clear(); + dbgs() << "\nalloc "; + TT.print(TT, dbgs()); + + T.startTimer(); Data = Start; End = Data + Size; uint64_t LastAddr = 0; @@ -643,12 +668,18 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap( while (Data < End) CurChildIndex += buildAddress2ProbeMap( &DummyInlineRoot, LastAddr, GuidFilter, FuncStartAddrs, CurChildIndex); + T.stopTimer(); + TT = T.getTotalTime(); + T.clear(); + dbgs() << "\nparsing "; + TT.print(TT, dbgs()); assert(Data == End && "Have unprocessed data in pseudo_probe section"); assert(PseudoProbeVec.size() == ProbeCount && "Mismatching probe count pre- and post-parsing"); assert(InlineTreeVec.size() == InlinedCount && "Mismatching function records count pre- and post-parsing"); + T.startTimer(); std::vector> SortedA2P(ProbeCount); for (const auto &[I, Probe] : llvm::enumerate(PseudoProbeVec)) SortedA2P[I] = {Probe.getAddress(), I}; @@ -657,6 +688,25 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap( for (const uint32_t I : llvm::make_second_range(SortedA2P)) Address2ProbesMap.emplace_back(PseudoProbeVec[I]); SortedA2P.clear(); + T.stopTimer(); + TT = T.getTotalTime(); + T.clear(); + dbgs() << "\nsorting "; + TT.print(TT, dbgs()); + dbgs() << '\n'; + size_t PPVecSize = 32 * PseudoProbeVec.capacity(); + size_t ITVecSize = 48 * InlineTreeVec.capacity(); + size_t G2FDMapSize = 32 * GUID2FuncDescMap.capacity(); + size_t StringSize = FuncNameAllocator.getBytesAllocated(); + size_t A2PSize = 8 * Address2ProbesMap.capacity(); + dbgs() << formatv("PPVec size: {0} GiB\n", 1.f * PPVecSize / (1 << 30)) + << formatv("ITVec size: {0} GiB\n", 1.f * ITVecSize / (1 << 30)) + << formatv("G2FDMap size: {0} GiB\n", 1.f * G2FDMapSize / (1 << 30)) + << formatv(" (strings {0} GiB)\n", 1.f * StringSize / (1 << 30)) + << formatv("A2P size: {0} GiB\n", 1.f * A2PSize / (1 << 30)) + << formatv("Total size: {0} GiB\n", + 1.f * (PPVecSize + ITVecSize + G2FDMapSize + A2PSize) / + (1 << 
30)); return true; } From 1c469cf2dd59241b65e07c5f1030af1d371d881b Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Tue, 27 Aug 2024 14:44:39 -0700 Subject: [PATCH 36/42] [BOLT][NFC] Rename profile-use-pseudo-probes --- bolt/lib/Profile/DataAggregator.cpp | 4 ++-- bolt/lib/Profile/YAMLProfileReader.cpp | 5 ----- bolt/lib/Profile/YAMLProfileWriter.cpp | 11 ++++++++--- bolt/lib/Rewrite/PseudoProbeRewriter.cpp | 6 +++--- bolt/test/X86/pseudoprobe-decoding-inline.test | 6 +++--- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 813d825f8b570..10d745cc69824 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -88,7 +88,7 @@ MaxSamples("max-samples", cl::cat(AggregatorCategory)); extern cl::opt ProfileFormat; -extern cl::opt ProfileUsePseudoProbes; +extern cl::opt ProfileWritePseudoProbes; extern cl::opt SaveProfile; cl::opt ReadPreAggregated( @@ -2300,7 +2300,7 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC, yaml::bolt::BinaryProfile BP; const MCPseudoProbeDecoder *PseudoProbeDecoder = - opts::ProfileUsePseudoProbes ? BC.getPseudoProbeDecoder() : nullptr; + opts::ProfileWritePseudoProbes ? BC.getPseudoProbeDecoder() : nullptr; // Fill out the header info. 
BP.Header.Version = 1; diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp index 3eca5e972fa5b..604a9fb4813be 100644 --- a/bolt/lib/Profile/YAMLProfileReader.cpp +++ b/bolt/lib/Profile/YAMLProfileReader.cpp @@ -49,11 +49,6 @@ llvm::cl::opt llvm::cl::opt ProfileUseDFS("profile-use-dfs", cl::desc("use DFS order for YAML profile"), cl::Hidden, cl::cat(BoltOptCategory)); - -llvm::cl::opt ProfileUsePseudoProbes( - "profile-use-pseudo-probes", - cl::desc("Use pseudo probes for profile generation and matching"), - cl::Hidden, cl::cat(BoltOptCategory)); } // namespace opts namespace llvm { diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp b/bolt/lib/Profile/YAMLProfileWriter.cpp index f74cf60e076d0..ffbf2388e912f 100644 --- a/bolt/lib/Profile/YAMLProfileWriter.cpp +++ b/bolt/lib/Profile/YAMLProfileWriter.cpp @@ -13,6 +13,7 @@ #include "bolt/Profile/DataAggregator.h" #include "bolt/Profile/ProfileReaderBase.h" #include "bolt/Rewrite/RewriteInstance.h" +#include "bolt/Utils/CommandLineOpts.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/raw_ostream.h" @@ -21,8 +22,12 @@ #define DEBUG_TYPE "bolt-prof" namespace opts { -extern llvm::cl::opt ProfileUseDFS; -extern llvm::cl::opt ProfileUsePseudoProbes; +using namespace llvm; +extern cl::opt ProfileUseDFS; +cl::opt ProfileWritePseudoProbes( + "profile-write-pseudo-probes", + cl::desc("Use pseudo probes in profile generation"), cl::Hidden, + cl::cat(BoltOptCategory)); } // namespace opts namespace llvm { @@ -59,7 +64,7 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS, yaml::bolt::BinaryFunctionProfile YamlBF; const BinaryContext &BC = BF.getBinaryContext(); const MCPseudoProbeDecoder *PseudoProbeDecoder = - opts::ProfileUsePseudoProbes ? BC.getPseudoProbeDecoder() : nullptr; + opts::ProfileWritePseudoProbes ? 
BC.getPseudoProbeDecoder() : nullptr; const uint16_t LBRProfile = BF.getProfileFlags() & BinaryFunction::PF_LBR; diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp index 4925b4b385d9b..fef721167869d 100644 --- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp +++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp @@ -49,7 +49,7 @@ static cl::opt PrintPseudoProbes( clEnumValN(PPP_All, "all", "enable all debugging printout")), cl::Hidden, cl::cat(BoltCategory)); -extern cl::opt ProfileUsePseudoProbes; +extern cl::opt ProfileWritePseudoProbes; } // namespace opts namespace { @@ -90,14 +90,14 @@ class PseudoProbeRewriter final : public MetadataRewriter { }; Error PseudoProbeRewriter::preCFGInitializer() { - if (opts::ProfileUsePseudoProbes) + if (opts::ProfileWritePseudoProbes) parsePseudoProbe(); return Error::success(); } Error PseudoProbeRewriter::postEmitFinalizer() { - if (!opts::ProfileUsePseudoProbes) + if (!opts::ProfileWritePseudoProbes) parsePseudoProbe(); updatePseudoProbes(); diff --git a/bolt/test/X86/pseudoprobe-decoding-inline.test b/bolt/test/X86/pseudoprobe-decoding-inline.test index b361551e5711e..1fdd00c7ef6c4 100644 --- a/bolt/test/X86/pseudoprobe-decoding-inline.test +++ b/bolt/test/X86/pseudoprobe-decoding-inline.test @@ -6,11 +6,11 @@ # PREAGG: B X:0 #main# 1 0 ## Check pseudo-probes in regular YAML profile (non-BOLTed binary) # RUN: link_fdata %s %S/../../../llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfbin %t.preagg PREAGG -# RUN: perf2bolt %S/../../../llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfbin -p %t.preagg --pa -w %t.yaml -o %t.fdata --profile-use-pseudo-probes +# RUN: perf2bolt %S/../../../llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfbin -p %t.preagg --pa -w %t.yaml -o %t.fdata --profile-write-pseudo-probes # RUN: FileCheck --input-file %t.yaml %s --check-prefix CHECK-YAML ## Check pseudo-probes in BAT YAML profile (BOLTed binary) # RUN: link_fdata %s 
%t.bolt %t.preagg2 PREAGG -# RUN: perf2bolt %t.bolt -p %t.preagg2 --pa -w %t.yaml2 -o %t.fdata2 --profile-use-pseudo-probes +# RUN: perf2bolt %t.bolt -p %t.preagg2 --pa -w %t.yaml2 -o %t.fdata2 --profile-write-pseudo-probes # RUN: FileCheck --input-file %t.yaml2 %s --check-prefix CHECK-YAML # CHECK-YAML: name: bar # CHECK-YAML: - bid: 0 @@ -30,7 +30,7 @@ # CHECK-YAML: guid: 0xDB956436E78DD5FA # CHECK-YAML: pseudo_probe_desc_hash: 0x10000FFFFFFFF # -## Check that without --profile-use-pseudo-probes option, no pseudo probes are +## Check that without --profile-write-pseudo-probes option, no pseudo probes are ## generated # RUN: perf2bolt %S/../../../llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfbin -p %t.preagg --pa -w %t.yaml -o %t.fdata # RUN: FileCheck --input-file %t.yaml %s --check-prefix CHECK-NO-OPT From 97f81017f04f23a5ec209e89b3800a34868c7c9a Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Tue, 27 Aug 2024 09:26:44 -0700 Subject: [PATCH 37/42] [BOLT][NFCI] Strip suffix in getLTOCommonName Also provide a mechanism to override the list of suffixes to consider. Override LTOSuffixes for getGUID in pseudo probe parsing. Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D61857819 Pull Request: https://github.com/llvm/llvm-project/pull/106243 --- bolt/include/bolt/Utils/Utils.h | 5 ++++ bolt/lib/Rewrite/PseudoProbeRewriter.cpp | 37 ++++++++++++++++++------ bolt/lib/Utils/Utils.cpp | 12 ++++++-- 3 files changed, 42 insertions(+), 12 deletions(-) diff --git a/bolt/include/bolt/Utils/Utils.h b/bolt/include/bolt/Utils/Utils.h index 3886c5f8757c0..9baee7d94066d 100644 --- a/bolt/include/bolt/Utils/Utils.h +++ b/bolt/include/bolt/Utils/Utils.h @@ -41,6 +41,11 @@ std::string getEscapedName(const StringRef &Name); /// Return the unescaped name std::string getUnescapedName(const StringRef &Name); +/// Return a common part for a given \p Name wrt a given \p Suffixes list. 
+/// Preserve the suffix if \p KeepSuffix is set, only dropping characters +/// following it, otherwise drop the suffix as well. +std::optional getCommonName(const StringRef Name, bool KeepSuffix, + ArrayRef Suffixes); /// LTO-generated function names take a form: /// /// .lto_priv./... diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp index fef721167869d..862208abbbd1a 100644 --- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp +++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp @@ -14,6 +14,7 @@ #include "bolt/Rewrite/MetadataRewriter.h" #include "bolt/Rewrite/MetadataRewriters.h" #include "bolt/Utils/CommandLineOpts.h" +#include "bolt/Utils/Utils.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCPseudoProbe.h" #include "llvm/Support/CommandLine.h" @@ -133,10 +134,16 @@ void PseudoProbeRewriter::parsePseudoProbe() { MCPseudoProbeDecoder::Uint64Set GuidFilter; MCPseudoProbeDecoder::Uint64Map FuncStartAddrs; + SmallVector Suffixes({".destroy", ".resume", ".llvm."}); for (const BinaryFunction *F : BC.getAllBinaryFunctions()) { for (const MCSymbol *Sym : F->getSymbols()) { - FuncStartAddrs[Function::getGUID(NameResolver::restore(Sym->getName()))] = - F->getAddress(); + StringRef SymName = NameResolver::restore(Sym->getName()); + if (std::optional CommonName = + getCommonName(SymName, false, Suffixes)) { + SymName = *CommonName; + } + uint64_t GUID = Function::getGUID(SymName); + FuncStartAddrs[GUID] = F->getAddress(); } } Contents = PseudoProbeSection->getContents(); @@ -155,13 +162,25 @@ void PseudoProbeRewriter::parsePseudoProbe() { ProbeDecoder.printProbesForAllAddresses(outs()); } - for (const auto &FuncDesc : ProbeDecoder.getGUID2FuncDescMap()) { - uint64_t GUID = FuncDesc.FuncGUID; - if (!FuncStartAddrs.contains(GUID)) - continue; - BinaryFunction *BF = BC.getBinaryFunctionAtAddress(FuncStartAddrs[GUID]); - assert(BF); - BF->setGUID(GUID); + const GUIDProbeFunctionMap &GUID2Func = ProbeDecoder.getGUID2FuncDescMap(); + // 
Checks GUID in GUID2Func and returns it if it's present or null otherwise. + auto checkGUID = [&](StringRef SymName) { + uint64_t GUID = Function::getGUID(SymName); + if (GUID2Func.find(GUID) == GUID2Func.end()) + return 0ull; + return GUID; + }; + for (BinaryFunction *F : BC.getAllBinaryFunctions()) { + for (const MCSymbol *Sym : F->getSymbols()) { + StringRef SymName = NameResolver::restore(Sym->getName()); + uint64_t GUID = checkGUID(SymName); + std::optional CommonName = + getCommonName(SymName, false, Suffixes); + if (!GUID && CommonName) + GUID = checkGUID(*CommonName); + if (GUID) + F->setGUID(GUID); + } } } diff --git a/bolt/lib/Utils/Utils.cpp b/bolt/lib/Utils/Utils.cpp index 718e97535fd22..ecc2f1010a985 100644 --- a/bolt/lib/Utils/Utils.cpp +++ b/bolt/lib/Utils/Utils.cpp @@ -66,15 +66,21 @@ std::string getUnescapedName(const StringRef &Name) { return Output; } -std::optional getLTOCommonName(const StringRef Name) { - for (StringRef Suffix : {".__uniq.", ".lto_priv.", ".constprop.", ".llvm."}) { +std::optional getCommonName(const StringRef Name, bool KeepSuffix, + ArrayRef Suffixes) { + for (StringRef Suffix : Suffixes) { size_t LTOSuffixPos = Name.find(Suffix); if (LTOSuffixPos != StringRef::npos) - return Name.substr(0, LTOSuffixPos + Suffix.size()); + return Name.substr(0, LTOSuffixPos + (KeepSuffix ? 
Suffix.size() : 0)); } return std::nullopt; } +std::optional getLTOCommonName(const StringRef Name) { + return getCommonName(Name, true, + {".__uniq.", ".lto_priv.", ".constprop.", ".llvm."}); +} + std::optional readDWARFExpressionTargetReg(StringRef ExprBytes) { uint8_t Opcode = ExprBytes[0]; if (Opcode == dwarf::DW_CFA_def_cfa_expression) From e0a705e3f79c40426af9e4decdcac4cab7129cb4 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Mon, 26 Aug 2024 13:53:31 -0700 Subject: [PATCH 38/42] [BOLT] Only parse probes for profiled functions in profile-write-pseudo-probes mode --- bolt/lib/Rewrite/PseudoProbeRewriter.cpp | 25 +++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp index 862208abbbd1a..4b3f9ab4cb64a 100644 --- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp +++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp @@ -72,7 +72,8 @@ class PseudoProbeRewriter final : public MetadataRewriter { /// Parse .pseudo_probe_desc section and .pseudo_probe section /// Setup Pseudo probe decoder - void parsePseudoProbe(); + /// If \p ProfiledOnly is set, only parse records for functions with profile. 
+ void parsePseudoProbe(bool ProfiledOnly = false); /// PseudoProbe decoder std::shared_ptr ProbeDecoderPtr; @@ -92,7 +93,7 @@ class PseudoProbeRewriter final : public MetadataRewriter { Error PseudoProbeRewriter::preCFGInitializer() { if (opts::ProfileWritePseudoProbes) - parsePseudoProbe(); + parsePseudoProbe(true); return Error::success(); } @@ -105,7 +106,7 @@ Error PseudoProbeRewriter::postEmitFinalizer() { return Error::success(); } -void PseudoProbeRewriter::parsePseudoProbe() { +void PseudoProbeRewriter::parsePseudoProbe(bool ProfiledOnly) { MCPseudoProbeDecoder &ProbeDecoder(*ProbeDecoderPtr); PseudoProbeDescSection = BC.getUniqueSectionByName(".pseudo_probe_desc"); PseudoProbeSection = BC.getUniqueSectionByName(".pseudo_probe"); @@ -136,6 +137,7 @@ void PseudoProbeRewriter::parsePseudoProbe() { MCPseudoProbeDecoder::Uint64Map FuncStartAddrs; SmallVector Suffixes({".destroy", ".resume", ".llvm."}); for (const BinaryFunction *F : BC.getAllBinaryFunctions()) { + bool HasProfile = F->hasProfileAvailable(); for (const MCSymbol *Sym : F->getSymbols()) { StringRef SymName = NameResolver::restore(Sym->getName()); if (std::optional CommonName = @@ -144,6 +146,23 @@ void PseudoProbeRewriter::parsePseudoProbe() { } uint64_t GUID = Function::getGUID(SymName); FuncStartAddrs[GUID] = F->getAddress(); + if (ProfiledOnly && HasProfile) + GuidFilter.insert(GUID); + std::optional CommonName = + getCommonName(SymName, false, Suffixes); + if (!CommonName) + continue; + GUID = Function::getGUID(*CommonName); + FuncStartAddrs.try_emplace(GUID, F->getAddress()); + if (ProfiledOnly && HasProfile) + GuidFilter.insert(GUID); + } + } + if (ProfiledOnly) { + for (const auto &FuncDesc : ProbeDecoder.getGUID2FuncDescMap()) { + uint64_t GUID = FuncDesc.FuncGUID; + if (!FuncStartAddrs.contains(GUID)) + GuidFilter.insert(GUID); } } Contents = PseudoProbeSection->getContents(); From 66fe5d50d65cb6bdda52076c4073508be0a5bc60 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Fri, 30 Aug 2024 
20:31:37 -0700 Subject: [PATCH 39/42] [BOLT] Add pseudo probe inline tree to YAML profile To be used for pseudo probe function matching (#100446). Test Plan: updated pseudoprobe-decoding-inline.test Pull Request: https://github.com/llvm/llvm-project/pull/107137 --- .../include/bolt/Profile/ProfileYAMLMapping.h | 49 +++++++++++---- bolt/lib/Profile/DataAggregator.cpp | 53 ++++++++++++++--- bolt/lib/Profile/YAMLProfileWriter.cpp | 59 +++++++++++++++---- .../test/X86/pseudoprobe-decoding-inline.test | 31 ++++++---- 4 files changed, 153 insertions(+), 39 deletions(-) diff --git a/bolt/include/bolt/Profile/ProfileYAMLMapping.h b/bolt/include/bolt/Profile/ProfileYAMLMapping.h index 2a0514d7d9304..f0cc116ebc6cb 100644 --- a/bolt/include/bolt/Profile/ProfileYAMLMapping.h +++ b/bolt/include/bolt/Profile/ProfileYAMLMapping.h @@ -95,24 +95,28 @@ template <> struct MappingTraits { namespace bolt { struct PseudoProbeInfo { - llvm::yaml::Hex64 GUID; uint64_t Index; + uint32_t InlineTreeIndex; + llvm::yaml::Hex32 Offset{0}; uint8_t Type; bool operator==(const PseudoProbeInfo &Other) const { - return GUID == Other.GUID && Index == Other.Index; + return InlineTreeIndex == Other.InlineTreeIndex && Index == Other.Index; } - bool operator!=(const PseudoProbeInfo &Other) const { - return !(*this == Other); + bool operator<(const PseudoProbeInfo &Other) const { + if (InlineTreeIndex == Other.InlineTreeIndex) + return Index < Other.Index; + return InlineTreeIndex < Other.InlineTreeIndex; } }; } // end namespace bolt template <> struct MappingTraits { static void mapping(IO &YamlIO, bolt::PseudoProbeInfo &PI) { - YamlIO.mapRequired("guid", PI.GUID); YamlIO.mapRequired("id", PI.Index); YamlIO.mapRequired("type", PI.Type); + YamlIO.mapOptional("inline_tree_id", PI.InlineTreeIndex, (uint32_t)0); + YamlIO.mapOptional("offset", PI.Offset, (uint32_t)0); } static const bool flow = true; @@ -122,7 +126,7 @@ template <> struct MappingTraits { 
LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::yaml::bolt::CallSiteInfo) LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::yaml::bolt::SuccessorInfo) -LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::yaml::bolt::PseudoProbeInfo) +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::bolt::PseudoProbeInfo) namespace llvm { namespace yaml { @@ -163,10 +167,35 @@ template <> struct MappingTraits { } }; +namespace bolt { +struct InlineTreeInfo { + uint32_t Index; + uint32_t ParentIndex; + uint32_t CallSiteProbe; + llvm::yaml::Hex64 GUID; + llvm::yaml::Hex64 Hash; + bool operator==(const InlineTreeInfo &Other) const { + return Index == Other.Index; + } +}; +} // end namespace bolt + +template <> struct MappingTraits { + static void mapping(IO &YamlIO, bolt::InlineTreeInfo &ITI) { + YamlIO.mapRequired("guid", ITI.GUID); + YamlIO.mapRequired("hash", ITI.Hash); + YamlIO.mapRequired("id", ITI.Index); + YamlIO.mapOptional("parent", ITI.ParentIndex, (uint32_t)0); + YamlIO.mapOptional("callsite", ITI.CallSiteProbe, 0); + } + + static const bool flow = true; +}; } // end namespace yaml } // end namespace llvm LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::bolt::BinaryBasicBlockProfile) +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::bolt::InlineTreeInfo) namespace llvm { namespace yaml { @@ -179,8 +208,7 @@ struct BinaryFunctionProfile { llvm::yaml::Hex64 Hash{0}; uint64_t ExecCount{0}; std::vector Blocks; - llvm::yaml::Hex64 GUID{0}; - llvm::yaml::Hex64 PseudoProbeDescHash{0}; + std::vector InlineTree; bool Used{false}; }; } // end namespace bolt @@ -194,9 +222,8 @@ template <> struct MappingTraits { YamlIO.mapRequired("nblocks", BFP.NumBasicBlocks); YamlIO.mapOptional("blocks", BFP.Blocks, std::vector()); - YamlIO.mapOptional("guid", BFP.GUID, (uint64_t)0); - YamlIO.mapOptional("pseudo_probe_desc_hash", BFP.PseudoProbeDescHash, - (uint64_t)0); + YamlIO.mapOptional("inline_tree", BFP.InlineTree, + std::vector()); } }; diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 
10d745cc69824..803cc4725b570 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -34,6 +34,7 @@ #include "llvm/Support/raw_ostream.h" #include #include +#include #include #include @@ -2402,12 +2403,43 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC, const unsigned BlockIndex = BlockMap.getBBIndex(BI.To.Offset); YamlBF.Blocks[BlockIndex].ExecCount += BI.Branches; } - if (PseudoProbeDecoder) { - if ((YamlBF.GUID = BF->getGUID())) { - const MCPseudoProbeFuncDesc *FuncDesc = - PseudoProbeDecoder->getFuncDescForGUID(YamlBF.GUID); - YamlBF.PseudoProbeDescHash = FuncDesc->FuncHash; + DenseMap + InlineTreeNodeId; + if (PseudoProbeDecoder && BF->getGUID()) { + std::queue ITWorklist; + // FIXME: faster inline tree lookup by top-level GUID + if (const MCDecodedPseudoProbeInlineTree *InlineTree = llvm::find_if( + PseudoProbeDecoder->getDummyInlineRoot().getChildren(), + [&](const auto &InlineTree) { + return InlineTree.Guid == BF->getGUID(); + })) { + ITWorklist.push(InlineTree); + InlineTreeNodeId[InlineTree] = 0; + auto Hash = + PseudoProbeDecoder->getFuncDescForGUID(BF->getGUID())->FuncHash; + YamlBF.InlineTree.emplace_back( + yaml::bolt::InlineTreeInfo{0, 0, 0, BF->getGUID(), Hash}); + } + uint32_t ParentId = 0; + uint32_t NodeId = 1; + while (!ITWorklist.empty()) { + const MCDecodedPseudoProbeInlineTree *Cur = ITWorklist.front(); + for (const MCDecodedPseudoProbeInlineTree &Child : + Cur->getChildren()) { + InlineTreeNodeId[&Child] = NodeId; + auto Hash = + PseudoProbeDecoder->getFuncDescForGUID(Child.Guid)->FuncHash; + YamlBF.InlineTree.emplace_back(yaml::bolt::InlineTreeInfo{ + NodeId++, ParentId, std::get<1>(Child.getInlineSite()), + Child.Guid, Hash}); + ITWorklist.push(&Child); + } + ITWorklist.pop(); + ++ParentId; } + } + + if (PseudoProbeDecoder) { // Fetch probes belonging to all fragments const AddressProbesMap &ProbeMap = PseudoProbeDecoder->getAddress2ProbesMap(); @@ -2420,12 +2452,19 @@ 
std::error_code DataAggregator::writeBATYAML(BinaryContext &BC, const uint32_t OutputAddress = Probe.getAddress(); const uint32_t InputOffset = BAT->translate( FuncAddr, OutputAddress - FuncAddr, /*IsBranchSrc=*/true); - const unsigned BlockIndex = getBlock(InputOffset).second; + const auto [BlockOffset, BlockIndex] = getBlock(InputOffset); + uint32_t NodeId = InlineTreeNodeId[Probe.getInlineTreeNode()]; + uint32_t Offset = InputOffset - BlockOffset; YamlBF.Blocks[BlockIndex].PseudoProbes.emplace_back( - yaml::bolt::PseudoProbeInfo{Probe.getGuid(), Probe.getIndex(), + yaml::bolt::PseudoProbeInfo{Probe.getIndex(), NodeId, Offset, Probe.getType()}); } } + for (yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) { + llvm::sort(YamlBB.PseudoProbes); + YamlBB.PseudoProbes.erase(llvm::unique(YamlBB.PseudoProbes), + YamlBB.PseudoProbes.end()); + } } // Drop blocks without a hash, won't be useful for stale matching. llvm::erase_if(YamlBF.Blocks, diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp b/bolt/lib/Profile/YAMLProfileWriter.cpp index ffbf2388e912f..817689230e2a7 100644 --- a/bolt/lib/Profile/YAMLProfileWriter.cpp +++ b/bolt/lib/Profile/YAMLProfileWriter.cpp @@ -12,11 +12,15 @@ #include "bolt/Profile/BoltAddressTranslation.h" #include "bolt/Profile/DataAggregator.h" #include "bolt/Profile/ProfileReaderBase.h" +#include "bolt/Profile/ProfileYAMLMapping.h" #include "bolt/Rewrite/RewriteInstance.h" #include "bolt/Utils/CommandLineOpts.h" +#include "llvm/MC/MCPseudoProbe.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/raw_ostream.h" +#include +#include #undef DEBUG_TYPE #define DEBUG_TYPE "bolt-prof" @@ -77,13 +81,6 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS, YamlBF.Hash = BF.getHash(); YamlBF.NumBasicBlocks = BF.size(); YamlBF.ExecCount = BF.getKnownExecutionCount(); - if (PseudoProbeDecoder) { - if ((YamlBF.GUID = BF.getGUID())) { - const MCPseudoProbeFuncDesc *FuncDesc = - 
PseudoProbeDecoder->getFuncDescForGUID(YamlBF.GUID); - YamlBF.PseudoProbeDescHash = FuncDesc->FuncHash; - } - } BinaryFunction::BasicBlockOrderType Order; llvm::copy(UseDFS ? BF.dfs() : BF.getLayout().blocks(), @@ -92,6 +89,40 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS, const FunctionLayout Layout = BF.getLayout(); Layout.updateLayoutIndices(Order); + DenseMap InlineTreeNodeId; + if (PseudoProbeDecoder && BF.getGUID()) { + std::queue ITWorklist; + // FIXME: faster inline tree lookup by top-level GUID + if (const MCDecodedPseudoProbeInlineTree *InlineTree = llvm::find_if( + PseudoProbeDecoder->getDummyInlineRoot().getChildren(), + [&](const auto &InlineTree) { + return InlineTree.Guid == BF.getGUID(); + })) { + ITWorklist.push(InlineTree); + InlineTreeNodeId[InlineTree] = 0; + auto Hash = + PseudoProbeDecoder->getFuncDescForGUID(BF.getGUID())->FuncHash; + YamlBF.InlineTree.emplace_back( + yaml::bolt::InlineTreeInfo{0, 0, 0, BF.getGUID(), Hash}); + } + uint32_t ParentId = 0; + uint32_t NodeId = 1; + while (!ITWorklist.empty()) { + const MCDecodedPseudoProbeInlineTree *Cur = ITWorklist.front(); + for (const MCDecodedPseudoProbeInlineTree &Child : Cur->getChildren()) { + InlineTreeNodeId[&Child] = NodeId; + auto Hash = + PseudoProbeDecoder->getFuncDescForGUID(Child.Guid)->FuncHash; + YamlBF.InlineTree.emplace_back(yaml::bolt::InlineTreeInfo{ + NodeId++, ParentId, std::get<1>(Child.getInlineSite()), Child.Guid, + Hash}); + ITWorklist.push(&Child); + } + ITWorklist.pop(); + ++ParentId; + } + } + for (const BinaryBasicBlock *BB : Order) { yaml::bolt::BinaryBasicBlockProfile YamlBB; YamlBB.Index = BB->getLayoutIndex(); @@ -198,10 +229,18 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS, const uint64_t FuncAddr = BF.getAddress(); const std::pair &BlockRange = BB->getInputAddressRange(); - for (const MCDecodedPseudoProbe &Probe : ProbeMap.find( - FuncAddr + BlockRange.first, FuncAddr + BlockRange.second)) + const std::pair 
BlockAddrRange = { + FuncAddr + BlockRange.first, FuncAddr + BlockRange.second}; + for (const MCDecodedPseudoProbe &Probe : + ProbeMap.find(BlockAddrRange.first, BlockAddrRange.second)) { + uint32_t NodeId = InlineTreeNodeId[Probe.getInlineTreeNode()]; + uint32_t Offset = Probe.getAddress() - BlockAddrRange.first; YamlBB.PseudoProbes.emplace_back(yaml::bolt::PseudoProbeInfo{ - Probe.getGuid(), Probe.getIndex(), Probe.getType()}); + Probe.getIndex(), NodeId, Offset, Probe.getType()}); + } + llvm::sort(YamlBB.PseudoProbes); + YamlBB.PseudoProbes.erase(llvm::unique(YamlBB.PseudoProbes), + YamlBB.PseudoProbes.end()); } YamlBF.Blocks.emplace_back(YamlBB); diff --git a/bolt/test/X86/pseudoprobe-decoding-inline.test b/bolt/test/X86/pseudoprobe-decoding-inline.test index 1fdd00c7ef6c4..629dd84ab8e1d 100644 --- a/bolt/test/X86/pseudoprobe-decoding-inline.test +++ b/bolt/test/X86/pseudoprobe-decoding-inline.test @@ -14,29 +14,38 @@ # RUN: FileCheck --input-file %t.yaml2 %s --check-prefix CHECK-YAML # CHECK-YAML: name: bar # CHECK-YAML: - bid: 0 -# CHECK-YAML: pseudo_probes: [ { guid: 0xE413754A191DB537, id: 1, type: 0 }, { guid: 0xE413754A191DB537, id: 4, type: 0 } ] -# CHECK-YAML: guid: 0xE413754A191DB537 -# CHECK-YAML: pseudo_probe_desc_hash: 0x10E852DA94 +# CHECK-YAML: pseudo_probes: +# CHECK-YAML-NEXT: - { id: 1, type: 0 +# CHECK-YAML-NEXT: - { id: 4, type: 0 +# CHECK-YAML: inline_tree: +# CHECK-YAML-NEXT: - { guid: 0xE413754A191DB537, hash: 0x10E852DA94, id: 0 } # # CHECK-YAML: name: foo # CHECK-YAML: - bid: 0 -# CHECK-YAML: pseudo_probes: [ { guid: 0x5CF8C24CDB18BDAC, id: 1, type: 0 }, { guid: 0x5CF8C24CDB18BDAC, id: 2, type: 0 } ] -# CHECK-YAML: guid: 0x5CF8C24CDB18BDAC -# CHECK-YAML: pseudo_probe_desc_hash: 0x200205A19C5B4 +# CHECK-YAML: pseudo_probes: +# CHECK-YAML-NEXT: - { id: 1, type: 0 } +# CHECK-YAML-NEXT: - { id: 2, type: 0 } +# CHECK-YAML: inline_tree: +# CHECK-YAML-NEXT: - { guid: 0x5CF8C24CDB18BDAC, hash: 0x200205A19C5B4, id: 0 } +# CHECK-YAML-NEXT: - { 
guid: 0xE413754A191DB537, hash: 0x10E852DA94, id: 1, callsite: 8 } # # CHECK-YAML: name: main # CHECK-YAML: - bid: 0 -# CHECK-YAML: pseudo_probes: [ { guid: 0xDB956436E78DD5FA, id: 1, type: 0 }, { guid: 0x5CF8C24CDB18BDAC, id: 1, type: 0 }, { guid: 0x5CF8C24CDB18BDAC, id: 2, type: 0 } ] -# CHECK-YAML: guid: 0xDB956436E78DD5FA -# CHECK-YAML: pseudo_probe_desc_hash: 0x10000FFFFFFFF +# CHECK-YAML: pseudo_probes: +# CHECK-YAML-NEXT: - { id: 1, type: 0 } +# CHECK-YAML-NEXT: - { id: 1, type: 0, inline_tree_id: 1 } +# CHECK-YAML-NEXT: - { id: 2, type: 0, inline_tree_id: 1 } +# CHECK-YAML: inline_tree: +# CHECK-YAML-NEXT: - { guid: 0xDB956436E78DD5FA, hash: 0x10000FFFFFFFF, id: 0 } +# CHECK-YAML-NEXT: - { guid: 0x5CF8C24CDB18BDAC, hash: 0x200205A19C5B4, id: 1, callsite: 2 } +# CHECK-YAML-NEXT: - { guid: 0xE413754A191DB537, hash: 0x10E852DA94, id: 2, parent: 1, callsite: 8 } # ## Check that without --profile-write-pseudo-probes option, no pseudo probes are ## generated # RUN: perf2bolt %S/../../../llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfbin -p %t.preagg --pa -w %t.yaml -o %t.fdata # RUN: FileCheck --input-file %t.yaml %s --check-prefix CHECK-NO-OPT # CHECK-NO-OPT-NOT: pseudo_probes -# CHECK-NO-OPT-NOT: guid -# CHECK-NO-OPT-NOT: pseudo_probe_desc_hash +# CHECK-NO-OPT-NOT: inline_tree CHECK: Report of decoding input pseudo probe binaries From 36197b175681d07b4704e576fb008cec3cc1e05e Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Wed, 28 Aug 2024 21:10:25 +0200 Subject: [PATCH 40/42] Reworked block probe matching Use new probe ifaces Get all function probes at once Drop ProfileUsePseudoProbes Unify matchWithBlockPseudoProbes Distinguish exact and loose probe match --- bolt/include/bolt/Core/BinaryContext.h | 20 +- bolt/lib/Passes/BinaryPasses.cpp | 40 ++- bolt/lib/Profile/StaleProfileMatching.cpp | 404 ++++++++++------------ bolt/lib/Rewrite/PseudoProbeRewriter.cpp | 8 +- 4 files changed, 237 insertions(+), 235 deletions(-) diff --git 
a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h index 3e20cb607e657..3f7b2ac0bc6cf 100644 --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -724,14 +724,26 @@ class BinaryContext { uint32_t NumStaleBlocks{0}; /// the number of exactly matched basic blocks uint32_t NumExactMatchedBlocks{0}; - /// the number of pseudo probe matched basic blocks - uint32_t NumPseudoProbeMatchedBlocks{0}; + /// the number of loosely matched basic blocks + uint32_t NumLooseMatchedBlocks{0}; + /// the number of exactly pseudo probe matched basic blocks + uint32_t NumPseudoProbeExactMatchedBlocks{0}; + /// the number of loosely pseudo probe matched basic blocks + uint32_t NumPseudoProbeLooseMatchedBlocks{0}; + /// the number of call matched basic blocks + uint32_t NumCallMatchedBlocks{0}; /// the total count of samples in the profile uint64_t StaleSampleCount{0}; /// the count of exactly matched samples uint64_t ExactMatchedSampleCount{0}; - /// the count of pseudo probe matched samples - uint64_t PseudoProbeMatchedSampleCount{0}; + /// the count of exactly matched samples + uint64_t LooseMatchedSampleCount{0}; + /// the count of exactly pseudo probe matched samples + uint64_t PseudoProbeExactMatchedSampleCount{0}; + /// the count of loosely pseudo probe matched samples + uint64_t PseudoProbeLooseMatchedSampleCount{0}; + /// the count of call matched samples + uint64_t CallMatchedSampleCount{0}; /// the number of stale functions that have matching number of blocks in /// the profile uint64_t NumStaleFuncsWithEqualBlockCount{0}; diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index b786f07a6a665..8edbd58c3ed3d 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -1524,15 +1524,43 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) { 100.0 * BC.Stats.ExactMatchedSampleCount / BC.Stats.StaleSampleCount, BC.Stats.ExactMatchedSampleCount, 
BC.Stats.StaleSampleCount); BC.outs() << format( - "BOLT-INFO: inference found a pseudo probe match for %.2f%% of basic " + "BOLT-INFO: inference found an exact pseudo probe match for %.2f%% of " + "basic blocks (%zu out of %zu stale) responsible for %.2f%% samples" + " (%zu out of %zu stale)\n", + 100.0 * BC.Stats.NumPseudoProbeExactMatchedBlocks / + BC.Stats.NumStaleBlocks, + BC.Stats.NumPseudoProbeExactMatchedBlocks, BC.Stats.NumStaleBlocks, + 100.0 * BC.Stats.PseudoProbeExactMatchedSampleCount / + BC.Stats.StaleSampleCount, + BC.Stats.PseudoProbeExactMatchedSampleCount, BC.Stats.StaleSampleCount); + BC.outs() << format( + "BOLT-INFO: inference found a loose pseudo probe match for %.2f%% of " + "basic blocks (%zu out of %zu stale) responsible for %.2f%% samples" + " (%zu out of %zu stale)\n", + 100.0 * BC.Stats.NumPseudoProbeLooseMatchedBlocks / + BC.Stats.NumStaleBlocks, + BC.Stats.NumPseudoProbeLooseMatchedBlocks, BC.Stats.NumStaleBlocks, + 100.0 * BC.Stats.PseudoProbeLooseMatchedSampleCount / + BC.Stats.StaleSampleCount, + BC.Stats.PseudoProbeLooseMatchedSampleCount, BC.Stats.StaleSampleCount); + BC.outs() << format( + "BOLT-INFO: inference found a call match for %.2f%% of basic " "blocks" " (%zu out of %zu stale) responsible for %.2f%% samples" " (%zu out of %zu stale)\n", - 100.0 * BC.Stats.NumPseudoProbeMatchedBlocks / BC.Stats.NumStaleBlocks, - BC.Stats.NumPseudoProbeMatchedBlocks, BC.Stats.NumStaleBlocks, - 100.0 * BC.Stats.PseudoProbeMatchedSampleCount / - BC.Stats.StaleSampleCount, - BC.Stats.PseudoProbeMatchedSampleCount, BC.Stats.StaleSampleCount); + 100.0 * BC.Stats.NumCallMatchedBlocks / BC.Stats.NumStaleBlocks, + BC.Stats.NumCallMatchedBlocks, BC.Stats.NumStaleBlocks, + 100.0 * BC.Stats.CallMatchedSampleCount / BC.Stats.StaleSampleCount, + BC.Stats.CallMatchedSampleCount, BC.Stats.StaleSampleCount); + BC.outs() << format( + "BOLT-INFO: inference found a loose match for %.2f%% of basic " + "blocks" + " (%zu out of %zu stale) responsible for %.2f%% 
samples" + " (%zu out of %zu stale)\n", + 100.0 * BC.Stats.NumLooseMatchedBlocks / BC.Stats.NumStaleBlocks, + BC.Stats.NumLooseMatchedBlocks, BC.Stats.NumStaleBlocks, + 100.0 * BC.Stats.LooseMatchedSampleCount / BC.Stats.StaleSampleCount, + BC.Stats.LooseMatchedSampleCount, BC.Stats.StaleSampleCount); } if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) { diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index ef9320ae168fe..2ec74ac7549f7 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -29,6 +29,7 @@ #include "bolt/Profile/YAMLProfileReader.h" #include "llvm/ADT/Bitfields.h" #include "llvm/ADT/Hashing.h" +#include "llvm/MC/MCPseudoProbe.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Timer.h" #include "llvm/Support/xxhash.h" @@ -46,7 +47,6 @@ namespace opts { extern cl::opt TimeRewrite; extern cl::OptionCategory BoltOptCategory; extern cl::opt Verbosity; -extern cl::opt ProfileUsePseudoProbes; cl::opt InferStaleProfile("infer-stale-profile", @@ -198,8 +198,6 @@ struct BlendedBlockHash { /// release. class StaleMatcher { public: - StaleMatcher(const uint64_t YamlBFGUID) : YamlBFGUID(YamlBFGUID) {} - /// Initialize stale matcher. void init(const std::vector &Blocks, const std::vector &Hashes, @@ -217,39 +215,38 @@ class StaleMatcher { } } - /// Creates a mapping from a inlined pseudo probe's guid and index to probe. - void mapGUIDAndIndexToProbe(uint64_t Guid, uint64_t Index, - const MCDecodedPseudoProbe *Probe) { - IndexAndGUIDToInlinedProbes[Guid][Index].push_back(Probe); - } - - /// Creates a mapping from a pseudo probe index to pseudo probe. - void mapIndexToProbe(uint64_t Index, const MCDecodedPseudoProbe *Probe) { - IndexToProbes[Index].push_back(Probe); - } - /// Creates a mapping from a pseudo probe to a flow block. 
void mapProbeToBB(const MCDecodedPseudoProbe *Probe, FlowBlock *Block) { BBPseudoProbeToBlock[Probe] = Block; } + enum MatchMethod : char { + MATCH_EXACT = 0, + MATCH_PROBE_EXACT, + MATCH_PROBE_LOOSE, + MATCH_OPCODE, + MATCH_CALL, + NO_MATCH + }; + /// Find the most similar flow block for a profile block given its hashes and /// pseudo probe information. - const FlowBlock * + std::pair matchBlock(BlendedBlockHash BlendedHash, uint64_t CallHash, - const std::vector &PseudoProbes) { - const FlowBlock *BestBlock = matchWithOpcodes(BlendedHash); - if (BestBlock) { - ++MatchedWithOpcodes; - return BestBlock; - } - BestBlock = matchWithCalls(BlendedHash, CallHash); - if (BestBlock) - return BestBlock; - BestBlock = matchWithPseudoProbes(BlendedHash, PseudoProbes); - if (BestBlock) - MatchedWithPseudoProbes.insert(BlendedHash.combine()); - return BestBlock; + const ArrayRef PseudoProbes, + const ArrayRef InlineTree) { + const auto &[Block, Hash] = matchWithOpcodes(BlendedHash); + if (isHighConfidenceMatch(Hash, BlendedHash)) + return {Block, MATCH_EXACT}; + const auto &[ProbeBlock, Exact] = + matchWithPseudoProbes(PseudoProbes, InlineTree); + if (ProbeBlock) + return {ProbeBlock, Exact ? MATCH_PROBE_EXACT : MATCH_PROBE_LOOSE}; + if (const FlowBlock *BestBlock = matchWithCalls(BlendedHash, CallHash)) + return {BestBlock, MATCH_CALL}; + if (Block) + return {Block, MATCH_OPCODE}; + return {nullptr, NO_MATCH}; } /// Returns true if the two basic blocks (in the binary and in the profile) @@ -260,48 +257,49 @@ class StaleMatcher { return Hash1.InstrHash == Hash2.InstrHash; } - /// Returns true if a profiled block was matched with its pseudo probe. - bool isPseudoProbeMatch(BlendedBlockHash YamlBBHash) { - return MatchedWithPseudoProbes.find(YamlBBHash.combine()) != - MatchedWithPseudoProbes.end(); + /// Returns matched InlineTree * for a given profile inline_tree_id. 
+ const MCDecodedPseudoProbeInlineTree * + getInlineTreeNode(uint32_t ProfileInlineTreeNodeId) const { + auto It = InlineTreeNodeMap.find(ProfileInlineTreeNodeId); + if (It == InlineTreeNodeMap.end()) + return nullptr; + return It->second; } - /// Returns the number of blocks matched with opcodes. - size_t getNumBlocksMatchedWithOpcodes() const { return MatchedWithOpcodes; } - - /// Returns the number of blocks matched with pseudo probes. - size_t getNumBlocksMatchedWithPseudoProbes() const { - return MatchedWithPseudoProbes.size(); + void mapInlineTreeNode(uint32_t ProfileNode, + const MCDecodedPseudoProbeInlineTree *BinaryNode) { + auto Res = InlineTreeNodeMap.try_emplace(ProfileNode, BinaryNode); + assert(Res.second && + "Duplicate mapping from profile node index to binary inline tree"); + (void)Res; } private: using HashBlockPairType = std::pair; std::unordered_map> OpHashToBlocks; std::unordered_map> CallHashToBlocks; - DenseMap> IndexToProbes; - DenseMap>> - IndexAndGUIDToInlinedProbes; + DenseMap InlineTreeNodeMap; DenseMap BBPseudoProbeToBlock; - DenseSet MatchedWithPseudoProbes; - const uint64_t YamlBFGUID{0}; - uint64_t MatchedWithOpcodes{0}; - // Uses OpcodeHash to find the most similar block for a given hash. - const FlowBlock *matchWithOpcodes(BlendedBlockHash BlendedHash) const { + // Uses OpcodeHash to find the most similar block (with blended hash) for a + // given hash. 
+ std::pair + matchWithOpcodes(BlendedBlockHash BlendedHash) const { auto BlockIt = OpHashToBlocks.find(BlendedHash.OpcodeHash); if (BlockIt == OpHashToBlocks.end()) - return nullptr; + return {nullptr, BlendedBlockHash(0)}; FlowBlock *BestBlock = nullptr; uint64_t BestDist = std::numeric_limits::max(); + BlendedBlockHash BestHash; for (const auto &[Hash, Block] : BlockIt->second) { uint64_t Dist = Hash.distance(BlendedHash); if (BestBlock == nullptr || Dist < BestDist) { BestDist = Dist; BestBlock = Block; + BestHash = Hash; } } - return BestBlock; + return {BestBlock, BestHash}; } // Uses CallHash to find the most similar block for a given hash. @@ -326,120 +324,71 @@ class StaleMatcher { return BestBlock; } - /// A helper function for logging. - static bool LogErrIfExpr(bool Expr, StringRef Message) { - if (Expr) - errs() << Message; - return Expr; - } - - /// Matches an inlined profile block with an inlined binary block based on - /// pseudo probes. - const FlowBlock *matchWithInlinedBlockPseudoProbes( - SmallVector - &InlinedBlockPseudoProbes) const { - if (opts::Verbosity >= 3) - outs() << "BOLT-INFO: attempting to match block with inlined block " - "pseudo probes\n"; - - size_t NInlinedBlockPseudoProbes = InlinedBlockPseudoProbes.size(); - if (LogErrIfExpr(NInlinedBlockPseudoProbes == 0, - "BOLT-WARNING: no pseudo probes in profile block\n")) - return nullptr; - if (LogErrIfExpr( - NInlinedBlockPseudoProbes > 1, - "BOLT-WARNING: more than 1 pseudo probes in profile block\n")) - return nullptr; - - const auto *InlinedPseudoProbe = InlinedBlockPseudoProbes[0]; - uint64_t Guid = InlinedPseudoProbe->GUID; - uint64_t Index = InlinedPseudoProbe->Index; - - auto GuidIt = IndexAndGUIDToInlinedProbes.find(Guid); - if (LogErrIfExpr( - GuidIt == IndexAndGUIDToInlinedProbes.end(), - "BOLT-WARNING: no pseudo probes found within BB at index\n")) - return nullptr; - auto IndexIt = GuidIt->second.find(Index); - if (LogErrIfExpr( - IndexIt == GuidIt->second.end(), - 
"BOLT-WARNING: no pseudo probes found within BB at index\n")) - return nullptr; - - if (LogErrIfExpr( - IndexIt->second.size() > 1, - "BOLT-WARNING: more than 1 block pseudo probes in BB at index\n")) - return nullptr; - - const MCDecodedPseudoProbe *BinaryPseudoProbe = IndexIt->second[0]; - auto BinaryPseudoProbeIt = BBPseudoProbeToBlock.find(BinaryPseudoProbe); - assert(BinaryPseudoProbeIt != BBPseudoProbeToBlock.end() && - "All binary pseudo probes should belong a binary basic block"); - - return BinaryPseudoProbeIt->second; - } - /// Matches a profile block with an binary block based on pseudo probes. - const FlowBlock *matchWithNonInlinedBlockPseudoProbes( - SmallVector &BlockPseudoProbes) - const { - if (opts::Verbosity >= 3) - outs() << "BOLT-INFO: attempting to match block with inlined block " - "pseudo probes\n"; - - size_t NBlockPseudoProbes = BlockPseudoProbes.size(); - if (LogErrIfExpr(NBlockPseudoProbes == 0, - "BOLT-WARNING: no pseudo probes in profile block\n")) - return nullptr; - if (LogErrIfExpr( - NBlockPseudoProbes > 1, - "BOLT-WARNING: more than 1 pseudo probes in profile block\n")) - return nullptr; - uint64_t Index = BlockPseudoProbes[0]->Index; - auto It = IndexToProbes.find(Index); - if (LogErrIfExpr( - It == IndexToProbes.end(), - "BOLT-WARNING: no block pseudo probes found within BB at index\n")) - return nullptr; - if (LogErrIfExpr( - It->second.size() > 1, - "BOLT-WARNING: more than 1 block pseudo probes in BB at index\n")) - return nullptr; - const MCDecodedPseudoProbe *BinaryPseudoProbe = It->second[0]; - auto BinaryPseudoProbeIt = BBPseudoProbeToBlock.find(BinaryPseudoProbe); - assert(BinaryPseudoProbeIt != BBPseudoProbeToBlock.end() && - "All binary pseudo probes should belong a binary basic block"); + /// Returns the best matching block (or nullptr) and whether the match is + /// unambiguous. 
+ std::pair matchWithPseudoProbes( + const ArrayRef BlockPseudoProbes, + const ArrayRef InlineTree) const { + if (!opts::StaleMatchingWithBlockPseudoProbes) + return {nullptr, false}; + + auto logIf = [](bool Expr, StringRef Message) { + LLVM_DEBUG(if (Expr) errs() << Message << '\n'); + return Expr; + }; - return BinaryPseudoProbeIt->second; - } + DenseMap FlowBlockMatchCount; - /// Uses pseudo probe information to attach the profile to the appropriate - /// block. - const FlowBlock *matchWithPseudoProbes( - BlendedBlockHash BlendedHash, - const std::vector &PseudoProbes) const { - if (!opts::StaleMatchingWithBlockPseudoProbes || !YamlBFGUID) - return nullptr; - - // Searches for the pseudo probe attached to the matched function's block. - SmallVector BlockPseudoProbes; - SmallVector InlinedBlockPseudoProbes; - for (const auto &PseudoProbe : PseudoProbes) { - // Skips pseudo probes attached to function calls. - if (PseudoProbe.Type != static_cast(PseudoProbeType::Block)) + for (const yaml::bolt::PseudoProbeInfo &Probe : BlockPseudoProbes) { + const MCDecodedPseudoProbeInlineTree *InlineTreeNode = + getInlineTreeNode(Probe.InlineTreeIndex); + if (logIf(!InlineTreeNode, + formatv("no matching inline tree node for {0} {1}", + Probe.InlineTreeIndex, Probe.Index).str())) { + ++FlowBlockMatchCount[nullptr]; continue; - if (PseudoProbe.GUID != YamlBFGUID) - InlinedBlockPseudoProbes.push_back(&PseudoProbe); - else - BlockPseudoProbes.push_back(&PseudoProbe); + } + const MCDecodedPseudoProbe *BinaryProbe = nullptr; + for (const MCDecodedPseudoProbe &FuncProbe : + InlineTreeNode->getProbes()) { + if (FuncProbe.getIndex() != Probe.Index) + continue; + BinaryProbe = &FuncProbe; + break; + } + if (logIf(!BinaryProbe, formatv("no matching binary probe for {0} {1}", + Probe.InlineTreeIndex, Probe.Index) + .str())) { + ++FlowBlockMatchCount[nullptr]; + continue; + } + auto It = BBPseudoProbeToBlock.find(BinaryProbe); + if (logIf(It == BBPseudoProbeToBlock.end(), + formatv("no 
probe->block for {0} {1}", Probe.InlineTreeIndex, + Probe.Index) + .str())) { + ++FlowBlockMatchCount[nullptr]; + continue; + } + const FlowBlock *Block = It->second; + ++FlowBlockMatchCount[Block]; + } + uint32_t BestMatchCount = 0; + uint32_t TotalMatchCount = 0; + const FlowBlock *BestMatchBlock = nullptr; + for (auto &[Block, Count] : FlowBlockMatchCount) { + logIf(true, formatv("block {0} count {1}", + Block ? Block->Index : UINT64_MAX, Count) + .str()); + TotalMatchCount += Count; + if (Count > BestMatchCount || + (Count == BestMatchCount && !BestMatchBlock)) { + BestMatchBlock = Block; + BestMatchCount = Count; + } } - // Returns nullptr if there is not a 1:1 mapping of the profile block pseudo - // probe and a binary block pseudo probe. - const FlowBlock *MatchedInlinedBlock = - matchWithInlinedBlockPseudoProbes(InlinedBlockPseudoProbes); - return MatchedInlinedBlock - ? MatchedInlinedBlock - : matchWithNonInlinedBlockPseudoProbes(BlockPseudoProbes); + return {BestMatchBlock, BestMatchCount / TotalMatchCount}; } }; @@ -630,26 +579,7 @@ size_t matchWeightsByHashes( assert(Func.Blocks.size() == BlockOrder.size() + 2); - // Sets the YamlBFGUID in the StaleMatcher such that if either the profiled or - // binary function dne or they are not equal, to zero, as not to perform - // pseudo probe block matching. Otherwise, the YamlBF's GUID is used for - // pseudo probe block matching. - const MCPseudoProbeDecoder *PseudoProbeDecoder = - opts::ProfileUsePseudoProbes && opts::StaleMatchingWithBlockPseudoProbes - ? 
BC.getPseudoProbeDecoder() - : nullptr; - uint64_t BFPseudoProbeDescHash = 0; - if (opts::ProfileUsePseudoProbes && - opts::StaleMatchingWithBlockPseudoProbes && BF.getGUID() != 0) { - assert(PseudoProbeDecoder && - "If BF has pseudo probe, BC should have a pseudo probe decoder"); - auto &GUID2FuncDescMap = PseudoProbeDecoder->getGUID2FuncDescMap(); - auto It = GUID2FuncDescMap.find(BF.getGUID()); - if (It != GUID2FuncDescMap.end()) - BFPseudoProbeDescHash = It->second.FuncHash; - } - - StaleMatcher Matcher(YamlBF.GUID); + StaleMatcher Matcher; std::vector CallHashes; std::vector Blocks; std::vector BlendedHashes; @@ -672,38 +602,55 @@ size_t matchWeightsByHashes( Blocks.push_back(&Func.Blocks[I + 1]); BlendedBlockHash BlendedHash(BB->getHash()); BlendedHashes.push_back(BlendedHash); - // Collects pseudo probes attached to the BB for use in the StaleMatcher. - if (opts::ProfileUsePseudoProbes && - opts::StaleMatchingWithBlockPseudoProbes && BFPseudoProbeDescHash && - YamlBF.PseudoProbeDescHash && - BFPseudoProbeDescHash == YamlBF.PseudoProbeDescHash) { - assert(PseudoProbeDecoder && - "If pseudo probes are in use, psuedo probe decoder should exist"); - const AddressProbesMap &ProbeMap = - PseudoProbeDecoder->getAddress2ProbesMap(); - const uint64_t FuncAddr = BF.getAddress(); - const std::pair &BlockRange = - BB->getInputAddressRange(); - const auto &BlockProbes = - llvm::make_range(ProbeMap.lower_bound(FuncAddr + BlockRange.first), - ProbeMap.lower_bound(FuncAddr + BlockRange.second)); - for (const auto &[_, Probes] : BlockProbes) { - for (const MCDecodedPseudoProbe &Probe : Probes) { - if (Probe.getType() != static_cast(PseudoProbeType::Block)) - continue; - if (Probe.getInlineTreeNode()->hasInlineSite()) - Matcher.mapGUIDAndIndexToProbe(Probe.getGuid(), Probe.getIndex(), - &Probe); - else - Matcher.mapIndexToProbe(Probe.getIndex(), &Probe); - Matcher.mapProbeToBB(&Probe, Blocks[I]); - } - } - } LLVM_DEBUG(dbgs() << "BB with index " << I << " has hash = " << 
Twine::utohexstr(BB->getHash()) << "\n"); } + // Collects function pseudo probes for use in the StaleMatcher. + if (opts::StaleMatchingWithBlockPseudoProbes) { + const MCPseudoProbeDecoder *PseudoProbeDecoder = BC.getPseudoProbeDecoder(); + assert(PseudoProbeDecoder && + "If pseudo probes are in use, pseudo probe decoder should exist"); + const AddressProbesMap &ProbeMap = + PseudoProbeDecoder->getAddress2ProbesMap(); + const uint64_t FuncAddr = BF.getAddress(); + for (const MCDecodedPseudoProbe &Probe : + ProbeMap.find(FuncAddr, FuncAddr + BF.getSize())) + if (const BinaryBasicBlock *BB = + BF.getBasicBlockContainingOffset(Probe.getAddress() - FuncAddr)) + Matcher.mapProbeToBB(&Probe, Blocks[BB->getIndex()]); + // Match inline tree nodes by GUID, checksum, parent, and call site. + unsigned MatchedNodes = 0; + const MCDecodedPseudoProbeInlineTree *DummyInlineRoot = + &PseudoProbeDecoder->getDummyInlineRoot(); + for (const yaml::bolt::InlineTreeInfo &InlineTreeNode : YamlBF.InlineTree) { + uint64_t GUID = InlineTreeNode.GUID; + uint64_t Hash = InlineTreeNode.Hash; + uint32_t InlineTreeNodeId = InlineTreeNode.Index; + uint32_t ParentId = InlineTreeNode.ParentIndex; + uint32_t CallSiteProbe = InlineTreeNode.CallSiteProbe; + const MCDecodedPseudoProbeInlineTree *ParentNode = + InlineTreeNodeId ? Matcher.getInlineTreeNode(ParentId) + : DummyInlineRoot; + if (!ParentNode) + continue; + for (const MCDecodedPseudoProbeInlineTree &Child : + ParentNode->getChildren()) { + if (Child.Guid != GUID || + PseudoProbeDecoder->getFuncDescForGUID(GUID)->FuncHash != Hash) + continue; + // Check inline site for non-toplev inline tree nodes. 
+ if (ParentNode != DummyInlineRoot && + std::get<1>(Child.getInlineSite()) != CallSiteProbe) + continue; + Matcher.mapInlineTreeNode(InlineTreeNodeId, &Child); + ++MatchedNodes; + break; + } + } + LLVM_DEBUG(errs() << "matched " << MatchedNodes << "/" + << YamlBF.InlineTree.size() << " inline tree nodes\n"); + } Matcher.init(Blocks, BlendedHashes, CallHashes); // Index in yaml profile => corresponding (matched) block @@ -724,7 +671,9 @@ size_t matchWeightsByHashes( else llvm_unreachable("Unhandled HashFunction"); } - MatchedBlock = Matcher.matchBlock(YamlHash, CallHash, YamlBB.PseudoProbes); + StaleMatcher::MatchMethod Method; + std::tie(MatchedBlock, Method) = Matcher.matchBlock( + YamlHash, CallHash, YamlBB.PseudoProbes, YamlBF.InlineTree); if (MatchedBlock == nullptr && YamlBB.Index == 0) MatchedBlock = Blocks[0]; if (MatchedBlock != nullptr) { @@ -737,16 +686,34 @@ size_t matchWeightsByHashes( << " with hash " << Twine::utohexstr(BinHash.combine()) << "\n"); // Update matching stats accounting for the matched block. 
- if (Matcher.isHighConfidenceMatch(BinHash, YamlHash)) { + switch (Method) { + case StaleMatcher::MATCH_EXACT: ++BC.Stats.NumExactMatchedBlocks; BC.Stats.ExactMatchedSampleCount += YamlBB.ExecCount; - LLVM_DEBUG(dbgs() << " exact match\n"); - } else if (Matcher.isPseudoProbeMatch(YamlHash)) { - ++BC.Stats.NumPseudoProbeMatchedBlocks; - BC.Stats.PseudoProbeMatchedSampleCount += YamlBB.ExecCount; - LLVM_DEBUG(dbgs() << " pseudo probe match\n"); - } else { - LLVM_DEBUG(dbgs() << " loose match\n"); + LLVM_DEBUG(dbgs() << " exact hash match\n"); + break; + case StaleMatcher::MATCH_PROBE_EXACT: + ++BC.Stats.NumPseudoProbeExactMatchedBlocks; + BC.Stats.PseudoProbeExactMatchedSampleCount += YamlBB.ExecCount; + LLVM_DEBUG(dbgs() << " exact pseudo probe match\n"); + break; + case StaleMatcher::MATCH_PROBE_LOOSE: + ++BC.Stats.NumPseudoProbeLooseMatchedBlocks; + BC.Stats.PseudoProbeLooseMatchedSampleCount += YamlBB.ExecCount; + LLVM_DEBUG(dbgs() << " loose pseudo probe match\n"); + break; + case StaleMatcher::MATCH_CALL: + ++BC.Stats.NumCallMatchedBlocks; + BC.Stats.CallMatchedSampleCount += YamlBB.ExecCount; + LLVM_DEBUG(dbgs() << " call match\n"); + break; + case StaleMatcher::MATCH_OPCODE: + ++BC.Stats.NumLooseMatchedBlocks; + BC.Stats.LooseMatchedSampleCount += YamlBB.ExecCount; + LLVM_DEBUG(dbgs() << " loose hash match\n"); + break; + case StaleMatcher::NO_MATCH: + LLVM_DEBUG(dbgs() << " no match\n"); } if (YamlBB.NumInstructions == BB->size()) ++BC.Stats.NumStaleBlocksWithEqualIcount; @@ -761,13 +728,6 @@ size_t matchWeightsByHashes( BC.Stats.StaleSampleCount += YamlBB.ExecCount; } - if (opts::Verbosity >= 2) { - outs() << "BOLT-INFO: " << Matcher.getNumBlocksMatchedWithPseudoProbes() - << " blocks matched with pseudo probes\n" - << "BOLT-INFO: " << Matcher.getNumBlocksMatchedWithOpcodes() - << " blocks matched with opcodes\n"; - } - // Match jumps from the profile to the jumps from CFG std::vector OutWeight(Func.Blocks.size(), 0); std::vector 
InWeight(Func.Blocks.size(), 0); diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp index 4b3f9ab4cb64a..43ab0d9fd63e5 100644 --- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp +++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp @@ -51,6 +51,7 @@ static cl::opt PrintPseudoProbes( cl::Hidden, cl::cat(BoltCategory)); extern cl::opt ProfileWritePseudoProbes; +extern cl::opt StaleMatchingWithBlockPseudoProbes; } // namespace opts namespace { @@ -92,14 +93,15 @@ class PseudoProbeRewriter final : public MetadataRewriter { }; Error PseudoProbeRewriter::preCFGInitializer() { - if (opts::ProfileWritePseudoProbes) - parsePseudoProbe(true); + if (opts::ProfileWritePseudoProbes || + opts::StaleMatchingWithBlockPseudoProbes) + parsePseudoProbe(opts::ProfileWritePseudoProbes); return Error::success(); } Error PseudoProbeRewriter::postEmitFinalizer() { - if (!opts::ProfileWritePseudoProbes) + if (!opts::StaleMatchingWithBlockPseudoProbes) parsePseudoProbe(); updatePseudoProbes(); From 8fafc048b46e8076db353ea6a7eea7b2ebae48d7 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Wed, 4 Sep 2024 16:05:10 -0700 Subject: [PATCH 41/42] drop logIf Created using spr 1.3.4 --- bolt/lib/Profile/StaleProfileMatching.cpp | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index 4f637c3e4f2eb..110769aa31e8e 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -331,19 +331,12 @@ class StaleMatcher { if (!opts::StaleMatchingWithBlockPseudoProbes) return {nullptr, false}; - auto logIf = [](bool Expr, StringRef Message) { - LLVM_DEBUG(if (Expr) errs() << Message << '\n'); - return Expr; - }; - DenseMap FlowBlockMatchCount; for (const yaml::bolt::PseudoProbeInfo &Probe : BlockPseudoProbes) { const MCDecodedPseudoProbeInlineTree *InlineTreeNode = getInlineTreeNode(Probe.InlineTreeIndex); - if 
(logIf(!InlineTreeNode, - formatv("no matching inline tree node for {0} {1}", - Probe.InlineTreeIndex, Probe.Index).str())) { + if (!InlineTreeNode) { ++FlowBlockMatchCount[nullptr]; continue; } @@ -355,17 +348,12 @@ class StaleMatcher { BinaryProbe = &FuncProbe; break; } - if (logIf(!BinaryProbe, formatv("no matching binary probe for {0} {1}", - Probe.InlineTreeIndex, Probe.Index) - .str())) { + if (!BinaryProbe) { ++FlowBlockMatchCount[nullptr]; continue; } auto It = BBPseudoProbeToBlock.find(BinaryProbe); - if (logIf(It == BBPseudoProbeToBlock.end(), - formatv("no probe->block for {0} {1}", Probe.InlineTreeIndex, - Probe.Index) - .str())) { + if (It == BBPseudoProbeToBlock.end()) { ++FlowBlockMatchCount[nullptr]; continue; } @@ -376,9 +364,6 @@ class StaleMatcher { uint32_t TotalMatchCount = 0; const FlowBlock *BestMatchBlock = nullptr; for (auto &[Block, Count] : FlowBlockMatchCount) { - logIf(true, formatv("block {0} count {1}", - Block ? Block->Index : UINT64_MAX, Count) - .str()); TotalMatchCount += Count; if (Count > BestMatchCount || (Count == BestMatchCount && !BestMatchBlock)) { From ebd3acfb23808721059c73defc5a33965fbb57ff Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Thu, 26 Sep 2024 13:50:19 -0700 Subject: [PATCH 42/42] Allow null block participate in majority vote, improves run-time performance Created using spr 1.3.4 --- bolt/lib/Profile/StaleProfileMatching.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index 4c60b9df875c5..ef70b979bad20 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -372,8 +372,6 @@ class StaleMatcher { const FlowBlock *BestMatchBlock = nullptr; for (auto &[FlowBlock, Count] : FlowBlockMatchCount) { TotalMatchCount += Count; - if (!FlowBlock) - continue; if (Count > BestMatchCount || (Count == BestMatchCount && !BestMatchBlock)) { BestMatchBlock = FlowBlock;