diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index a300e5b2b1dab..813d825f8b570 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -2415,17 +2415,15 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC, Fragments.insert(BF); for (const BinaryFunction *F : Fragments) { const uint64_t FuncAddr = F->getAddress(); - const auto &FragmentProbes = - llvm::make_range(ProbeMap.lower_bound(FuncAddr), - ProbeMap.lower_bound(FuncAddr + F->getSize())); - for (const auto &[OutputAddress, Probes] : FragmentProbes) { + for (const MCDecodedPseudoProbe &Probe : + ProbeMap.find(FuncAddr, FuncAddr + F->getSize())) { + const uint32_t OutputAddress = Probe.getAddress(); const uint32_t InputOffset = BAT->translate( FuncAddr, OutputAddress - FuncAddr, /*IsBranchSrc=*/true); const unsigned BlockIndex = getBlock(InputOffset).second; - for (const MCDecodedPseudoProbe &Probe : Probes) - YamlBF.Blocks[BlockIndex].PseudoProbes.emplace_back( - yaml::bolt::PseudoProbeInfo{Probe.getGuid(), Probe.getIndex(), - Probe.getType()}); + YamlBF.Blocks[BlockIndex].PseudoProbes.emplace_back( + yaml::bolt::PseudoProbeInfo{Probe.getGuid(), Probe.getIndex(), + Probe.getType()}); } } } diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp b/bolt/lib/Profile/YAMLProfileWriter.cpp index 84777741d611a..f74cf60e076d0 100644 --- a/bolt/lib/Profile/YAMLProfileWriter.cpp +++ b/bolt/lib/Profile/YAMLProfileWriter.cpp @@ -193,13 +193,10 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS, const uint64_t FuncAddr = BF.getAddress(); const std::pair &BlockRange = BB->getInputAddressRange(); - const auto &BlockProbes = - llvm::make_range(ProbeMap.lower_bound(FuncAddr + BlockRange.first), - ProbeMap.lower_bound(FuncAddr + BlockRange.second)); - for (const auto &[_, Probes] : BlockProbes) - for (const MCDecodedPseudoProbe &Probe : Probes) - YamlBB.PseudoProbes.emplace_back(yaml::bolt::PseudoProbeInfo{ - Probe.getGuid(), Probe.getIndex(), Probe.getType()}); + for (const MCDecodedPseudoProbe &Probe : ProbeMap.find( + FuncAddr + BlockRange.first, FuncAddr + BlockRange.second)) + YamlBB.PseudoProbes.emplace_back(yaml::bolt::PseudoProbeInfo{ + Probe.getGuid(), Probe.getIndex(), Probe.getType()}); } YamlBF.Blocks.emplace_back(YamlBB); diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp index 886bbdbf9d686..4925b4b385d9b 100644 --- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp +++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp @@ -143,7 +143,6 @@ void PseudoProbeRewriter::parsePseudoProbe() { if (!ProbeDecoder.buildAddress2ProbeMap( reinterpret_cast(Contents.data()), Contents.size(), GuidFilter, FuncStartAddrs)) { - ProbeDecoder.getAddress2ProbesMap().clear(); errs() << "BOLT-WARNING: fail in building Address2ProbeMap\n"; return; } @@ -156,7 +155,8 @@ void PseudoProbeRewriter::parsePseudoProbe() { ProbeDecoder.printProbesForAllAddresses(outs()); } - for (const auto &[GUID, FuncDesc] : ProbeDecoder.getGUID2FuncDescMap()) { + for (const auto &FuncDesc : ProbeDecoder.getGUID2FuncDescMap()) { + uint64_t GUID = FuncDesc.FuncGUID; if (!FuncStartAddrs.contains(GUID)) continue; BinaryFunction *BF = BC.getBinaryFunctionAtAddress(FuncStartAddrs[GUID]); @@ -174,13 +174,13 @@ void PseudoProbeRewriter::updatePseudoProbes() { AddressProbesMap &Address2ProbesMap = ProbeDecoder.getAddress2ProbesMap(); const GUIDProbeFunctionMap &GUID2Func = ProbeDecoder.getGUID2FuncDescMap(); - for (auto &AP : Address2ProbesMap) { - 
BinaryFunction *F = BC.getBinaryFunctionContainingAddress(AP.first); + for (MCDecodedPseudoProbe &Probe : Address2ProbesMap) { + uint64_t Address = Probe.getAddress(); + BinaryFunction *F = BC.getBinaryFunctionContainingAddress(Address); // If F is removed, eliminate all probes inside it from inline tree // Setting probes' addresses as INT64_MAX means elimination if (!F) { - for (MCDecodedPseudoProbe &Probe : AP.second) - Probe.setAddress(INT64_MAX); + Probe.setAddress(INT64_MAX); continue; } // If F is not emitted, the function will remain in the same address as its @@ -188,45 +188,36 @@ void PseudoProbeRewriter::updatePseudoProbes() { if (!F->isEmitted()) continue; - uint64_t Offset = AP.first - F->getAddress(); + uint64_t Offset = Address - F->getAddress(); const BinaryBasicBlock *BB = F->getBasicBlockContainingOffset(Offset); uint64_t BlkOutputAddress = BB->getOutputAddressRange().first; // Check if block output address is defined. // If not, such block is removed from binary. Then remove the probes from // inline tree if (BlkOutputAddress == 0) { - for (MCDecodedPseudoProbe &Probe : AP.second) - Probe.setAddress(INT64_MAX); + Probe.setAddress(INT64_MAX); continue; } - unsigned ProbeTrack = AP.second.size(); - std::list::iterator Probe = AP.second.begin(); - while (ProbeTrack != 0) { - if (Probe->isBlock()) { - Probe->setAddress(BlkOutputAddress); - } else if (Probe->isCall()) { - // A call probe may be duplicated due to ICP - // Go through output of InputOffsetToAddressMap to collect all related - // probes - auto CallOutputAddresses = BC.getIOAddressMap().lookupAll(AP.first); - auto CallOutputAddress = CallOutputAddresses.first; - if (CallOutputAddress == CallOutputAddresses.second) { - Probe->setAddress(INT64_MAX); - } else { - Probe->setAddress(CallOutputAddress->second); - CallOutputAddress = std::next(CallOutputAddress); - } - - while (CallOutputAddress != CallOutputAddresses.second) { - AP.second.push_back(*Probe); - AP.second.back().setAddress(CallOutputAddress->second); - Probe->getInlineTreeNode()->addProbes(&(AP.second.back())); - CallOutputAddress = std::next(CallOutputAddress); - } + if (Probe.isBlock()) { + Probe.setAddress(BlkOutputAddress); + } else if (Probe.isCall()) { + // A call probe may be duplicated due to ICP + // Go through output of InputOffsetToAddressMap to collect all related + // probes + auto CallOutputAddresses = BC.getIOAddressMap().lookupAll(Address); + auto CallOutputAddress = CallOutputAddresses.first; + if (CallOutputAddress == CallOutputAddresses.second) { + Probe.setAddress(INT64_MAX); + } else { + Probe.setAddress(CallOutputAddress->second); + CallOutputAddress = std::next(CallOutputAddress); + } + + while (CallOutputAddress != CallOutputAddresses.second) { + ProbeDecoder.addInjectedProbe(Probe, CallOutputAddress->second); + CallOutputAddress = std::next(CallOutputAddress); } - Probe = std::next(Probe); - ProbeTrack--; } } @@ -242,22 +233,16 @@ void PseudoProbeRewriter::updatePseudoProbes() { BinaryBlock.getName(); // scan all addresses -> correlate probe to block when print out - std::vector Addresses; - for (auto &Entry : Address2ProbesMap) - Addresses.push_back(Entry.first); - llvm::sort(Addresses); - for (uint64_t Key : Addresses) { - for (MCDecodedPseudoProbe &Probe : Address2ProbesMap[Key]) { - if (Probe.getAddress() == INT64_MAX) - outs() << "Deleted Probe: "; - else - outs() << "Address: " << format_hex(Probe.getAddress(), 8) << " "; - Probe.print(outs(), GUID2Func, true); - // print block name only if the probe is block type and 
undeleted. - if (Probe.isBlock() && Probe.getAddress() != INT64_MAX) - outs() << format_hex(Probe.getAddress(), 8) << " Probe is in " - << Addr2BlockNames[Probe.getAddress()] << "\n"; - } + for (MCDecodedPseudoProbe &Probe : Address2ProbesMap) { + if (Probe.getAddress() == INT64_MAX) + outs() << "Deleted Probe: "; + else + outs() << "Address: " << format_hex(Probe.getAddress(), 8) << " "; + Probe.print(outs(), GUID2Func, true); + // print block name only if the probe is block type and undeleted. + if (Probe.isBlock() && Probe.getAddress() != INT64_MAX) + outs() << format_hex(Probe.getAddress(), 8) << " Probe is in " + << Addr2BlockNames[Probe.getAddress()] << "\n"; } outs() << "=======================================\n"; } @@ -333,7 +318,7 @@ void PseudoProbeRewriter::encodePseudoProbes() { ProbeDecoder.getDummyInlineRoot(); for (auto Child = Root.getChildren().begin(); Child != Root.getChildren().end(); ++Child) - Inlinees[Child->first] = Child->second.get(); + Inlinees[Child->getInlineSite()] = &*Child; for (auto Inlinee : Inlinees) // INT64_MAX is "placeholder" of unused callsite index field in the pair @@ -359,25 +344,37 @@ void PseudoProbeRewriter::encodePseudoProbes() { EmitInt(Cur->Guid, 8); // Emit number of probes in this node uint64_t Deleted = 0; - for (MCDecodedPseudoProbe *&Probe : Cur->getProbes()) + for (MCDecodedPseudoProbe *&Probe : + llvm::make_pointer_range(Cur->getProbes())) if (Probe->getAddress() == INT64_MAX) Deleted++; LLVM_DEBUG(dbgs() << "Deleted Probes:" << Deleted << "\n"); - uint64_t ProbesSize = Cur->getProbes().size() - Deleted; + size_t InjectedProbes = ProbeDecoder.getNumInjectedProbes(Cur); + uint64_t ProbesSize = Cur->getProbes().size() - Deleted + InjectedProbes; EmitULEB128IntValue(ProbesSize); // Emit number of direct inlinees EmitULEB128IntValue(Cur->getChildren().size()); // Emit probes in this group - for (MCDecodedPseudoProbe *&Probe : Cur->getProbes()) { + for (MCDecodedPseudoProbe *&Probe : + llvm::make_pointer_range(Cur->getProbes())) { if (Probe->getAddress() == INT64_MAX) continue; EmitDecodedPseudoProbe(Probe); LastProbe = Probe; } + if (InjectedProbes) { + for (MCDecodedPseudoProbe *&Probe : + llvm::make_pointer_range(ProbeDecoder.getInjectedProbes(Cur))) { + if (Probe->getAddress() == INT64_MAX) + continue; + EmitDecodedPseudoProbe(Probe); + LastProbe = Probe; + } + } for (auto Child = Cur->getChildren().begin(); Child != Cur->getChildren().end(); ++Child) - Inlinees[Child->first] = Child->second.get(); + Inlinees[Child->getInlineSite()] = &*Child; for (const auto &Inlinee : Inlinees) { assert(Cur->Guid != 0 && "non root tree node must have nonzero Guid"); NextNodes.push_back({std::get<1>(Inlinee.first), Inlinee.second}); diff --git a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp index aa115cd450c4f..968a4a55a6d79 100644 --- a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp @@ -66,7 +66,8 @@ bool isUnaryLogicalNotOperator(const Stmt *Statement) { void fixGenericExprCastToBool(DiagnosticBuilder &Diag, const ImplicitCastExpr *Cast, const Stmt *Parent, - ASTContext &Context) { + ASTContext &Context, + bool UseUpperCaseLiteralSuffix) { // In case of expressions like (! integer), we should remove the redundant not // operator and use inverted comparison (integer == 0). 
bool InvertComparison = @@ -112,9 +113,14 @@ void fixGenericExprCastToBool(DiagnosticBuilder &Diag, EndLocInsertion += " != "; } - EndLocInsertion += getZeroLiteralToCompareWithForType( + const StringRef ZeroLiteral = getZeroLiteralToCompareWithForType( Cast->getCastKind(), SubExpr->getType(), Context); + if (UseUpperCaseLiteralSuffix) + EndLocInsertion += ZeroLiteral.upper(); + else + EndLocInsertion += ZeroLiteral; + if (NeedOuterParens) { EndLocInsertion += ")"; } @@ -248,12 +254,15 @@ ImplicitBoolConversionCheck::ImplicitBoolConversionCheck( StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), AllowIntegerConditions(Options.get("AllowIntegerConditions", false)), - AllowPointerConditions(Options.get("AllowPointerConditions", false)) {} + AllowPointerConditions(Options.get("AllowPointerConditions", false)), + UseUpperCaseLiteralSuffix( + Options.get("UseUpperCaseLiteralSuffix", false)) {} void ImplicitBoolConversionCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "AllowIntegerConditions", AllowIntegerConditions); Options.store(Opts, "AllowPointerConditions", AllowPointerConditions); + Options.store(Opts, "UseUpperCaseLiteralSuffix", UseUpperCaseLiteralSuffix); } void ImplicitBoolConversionCheck::registerMatchers(MatchFinder *Finder) { @@ -378,7 +387,8 @@ void ImplicitBoolConversionCheck::handleCastToBool(const ImplicitCastExpr *Cast, if (!EquivalentLiteral.empty()) { Diag << tooling::fixit::createReplacement(*Cast, EquivalentLiteral); } else { - fixGenericExprCastToBool(Diag, Cast, Parent, Context); + fixGenericExprCastToBool(Diag, Cast, Parent, Context, + UseUpperCaseLiteralSuffix); } } @@ -392,8 +402,16 @@ void ImplicitBoolConversionCheck::handleCastFromBool( if (const auto *BoolLiteral = dyn_cast(Cast->getSubExpr()->IgnoreParens())) { - Diag << tooling::fixit::createReplacement( - *Cast, getEquivalentForBoolLiteral(BoolLiteral, DestType, Context)); + + const auto EquivalentForBoolLiteral = + getEquivalentForBoolLiteral(BoolLiteral, DestType, Context); + if (UseUpperCaseLiteralSuffix) + Diag << tooling::fixit::createReplacement( + *Cast, EquivalentForBoolLiteral.upper()); + else + Diag << tooling::fixit::createReplacement(*Cast, + EquivalentForBoolLiteral); + } else { fixGenericExprCastFromBool(Diag, Cast, Context, DestType.getAsString()); } diff --git a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h index 9defec91e2f78..5947f7316e67c 100644 --- a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h +++ b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h @@ -36,6 +36,7 @@ class ImplicitBoolConversionCheck : public ClangTidyCheck { const bool AllowIntegerConditions; const bool AllowPointerConditions; + const bool UseUpperCaseLiteralSuffix; }; } // namespace clang::tidy::readability diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 1b025e8f90f7b..b001a6ad44669 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -112,6 +112,11 @@ Changes in existing checks ` check to support replacing member function calls too. +- Improved :doc:`readability-implicit-bool-conversion + ` check + by adding the option `UseUpperCaseLiteralSuffix` to select the + case of the literal suffix in fixes. + - Improved :doc:`readability-redundant-smartptr-get ` check to remove `->`, when redundant `get()` is removed. 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/implicit-bool-conversion.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/implicit-bool-conversion.rst index 1ab21ffeb4228..88cff387f4c16 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/implicit-bool-conversion.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/implicit-bool-conversion.rst @@ -133,3 +133,17 @@ Options When `true`, the check will allow conditional pointer conversions. Default is `false`. + +.. option:: UseUpperCaseLiteralSuffix + + When `true`, the replacements will use an uppercase literal suffix in the + provided fixes. Default is `false`. + + Example + + .. code-block:: c++ + + uint32_t foo; + if (foo) {} + // ^ propose replacement default: if (foo != 0u) {} + // ^ propose replacement with option `UseUpperCaseLiteralSuffix`: if (foo != 0U) {} diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.c b/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.c index a8c69858f76b6..f3dc32c10d640 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.c +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.c @@ -1,4 +1,8 @@ // RUN: %check_clang_tidy %s readability-implicit-bool-conversion %t -- -- -std=c23 +// RUN: %check_clang_tidy -check-suffix=UPPER-CASE %s readability-implicit-bool-conversion %t -- \ +// RUN: -config='{CheckOptions: { \ +// RUN: readability-implicit-bool-conversion.UseUpperCaseLiteralSuffix: true \ +// RUN: }}' -- -std=c23 #undef NULL #define NULL 0L @@ -95,6 +99,7 @@ void implicitConversionFromBoolLiterals() { functionTakingUnsignedLong(false); // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: implicit conversion 'bool' -> 'unsigned long' // CHECK-FIXES: functionTakingUnsignedLong(0u); + // CHECK-FIXES-UPPER-CASE: functionTakingUnsignedLong(0U); functionTakingSignedChar(true); // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: implicit conversion 'bool' -> 'signed char' @@ -103,6 +108,7 @@ void implicitConversionFromBoolLiterals() { functionTakingFloat(false); // CHECK-MESSAGES: :[[@LINE-1]]:23: warning: implicit conversion 'bool' -> 'float' // CHECK-FIXES: functionTakingFloat(0.0f); + // CHECK-FIXES-UPPER-CASE: functionTakingFloat(0.0F); functionTakingDouble(true); // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: implicit conversion 'bool' -> 'double' @@ -160,11 +166,13 @@ void implicitConversionToBoolSimpleCases() { functionTakingBool(unsignedLong); // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'unsigned long' -> 'bool' // CHECK-FIXES: functionTakingBool(unsignedLong != 0u); + // CHECK-FIXES-UPPER-CASE: functionTakingBool(unsignedLong != 0U); float floating = 0.0f; functionTakingBool(floating); // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'float' -> 'bool' // CHECK-FIXES: functionTakingBool(floating != 0.0f); + // CHECK-FIXES-UPPER-CASE: functionTakingBool(floating != 0.0F); double doubleFloating = 1.0f; functionTakingBool(doubleFloating); @@ -194,6 +202,7 @@ void implicitConversionToBoolInSingleExpressions() { boolComingFromFloat = floating; // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: implicit conversion 'float' -> 'bool' // CHECK-FIXES: boolComingFromFloat = (floating != 0.0f); + // CHECK-FIXES-UPPER-CASE: boolComingFromFloat = (floating != 0.0F); signed char character = 'a'; bool boolComingFromChar; @@ -288,6 +297,7 @@ void implicitConversionToBoolFromUnaryMinusAndZeroLiterals() { 
functionTakingBool(-0.0f); // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'float' -> 'bool' // CHECK-FIXES: functionTakingBool((-0.0f) != 0.0f); + // CHECK-FIXES-UPPER-CASE: functionTakingBool((-0.0f) != 0.0F); functionTakingBool(-0.0); // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'double' -> 'bool' diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.cpp index d6e7dcc4d8867..c4b7a77b92f0a 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.cpp @@ -1,4 +1,8 @@ // RUN: %check_clang_tidy %s readability-implicit-bool-conversion %t +// RUN: %check_clang_tidy -check-suffix=UPPER-CASE %s readability-implicit-bool-conversion %t -- \ +// RUN: -config='{CheckOptions: { \ +// RUN: readability-implicit-bool-conversion.UseUpperCaseLiteralSuffix: true \ +// RUN: }}' // We need NULL macro, but some buildbots don't like including header // This is a portable way of getting it to work @@ -99,6 +103,7 @@ void implicitConversionFromBoolLiterals() { functionTaking(false); // CHECK-MESSAGES: :[[@LINE-1]]:33: warning: implicit conversion 'bool' -> 'unsigned long' // CHECK-FIXES: functionTaking(0u); + // CHECK-FIXES-UPPER-CASE: functionTaking(0U); functionTaking(true); // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: implicit conversion 'bool' -> 'signed char' @@ -107,6 +112,7 @@ void implicitConversionFromBoolLiterals() { functionTaking(false); // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: implicit conversion 'bool' -> 'float' // CHECK-FIXES: functionTaking(0.0f); + // CHECK-FIXES-UPPER-CASE: functionTaking(0.0F); functionTaking(true); // CHECK-MESSAGES: :[[@LINE-1]]:26: warning: implicit conversion 'bool' -> 'double' @@ -178,11 +184,13 @@ void implicitConversionToBoolSimpleCases() { functionTaking(unsignedLong); // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: implicit conversion 'unsigned long' -> 'bool' // CHECK-FIXES: functionTaking(unsignedLong != 0u); + // CHECK-FIXES-UPPER-CASE: functionTaking(unsignedLong != 0U); float floating = 0.0f; functionTaking(floating); // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: implicit conversion 'float' -> 'bool' // CHECK-FIXES: functionTaking(floating != 0.0f); + // CHECK-FIXES-UPPER-CASE: functionTaking(floating != 0.0F); double doubleFloating = 1.0f; functionTaking(doubleFloating); @@ -215,6 +223,7 @@ void implicitConversionToBoolInSingleExpressions() { bool boolComingFromFloat = floating; // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: implicit conversion 'float' -> 'bool' // CHECK-FIXES: bool boolComingFromFloat = floating != 0.0f; + // CHECK-FIXES-UPPER-CASE: bool boolComingFromFloat = floating != 0.0F; signed char character = 'a'; bool boolComingFromChar = character; @@ -240,6 +249,7 @@ void implicitConversionToBoolInComplexExpressions() { bool boolComingFromFloating = floating - 0.3f || boolean; // CHECK-MESSAGES: :[[@LINE-1]]:33: warning: implicit conversion 'float' -> 'bool' // CHECK-FIXES: bool boolComingFromFloating = ((floating - 0.3f) != 0.0f) || boolean; + // CHECK-FIXES-UPPER-CASE: bool boolComingFromFloating = ((floating - 0.3f) != 0.0F) || boolean; double doubleFloating = 0.3; bool boolComingFromDoubleFloating = (doubleFloating - 0.4) && boolean; @@ -257,6 +267,7 @@ void implicitConversionInNegationExpressions() { bool boolComingFromNegatedFloat = ! 
floating; // CHECK-MESSAGES: :[[@LINE-1]]:39: warning: implicit conversion 'float' -> 'bool' // CHECK-FIXES: bool boolComingFromNegatedFloat = floating == 0.0f; + // CHECK-FIXES-UPPER-CASE: bool boolComingFromNegatedFloat = floating == 0.0F; signed char character = 'a'; bool boolComingFromNegatedChar = (! character); @@ -284,6 +295,7 @@ void implicitConversionToBoolInControlStatements() { while (floating) {} // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: implicit conversion 'float' -> 'bool' // CHECK-FIXES: while (floating != 0.0f) {} + // CHECK-FIXES-UPPER-CASE: while (floating != 0.0F) {} double doubleFloating = 0.4; do {} while (doubleFloating); @@ -296,6 +308,7 @@ bool implicitConversionToBoolInReturnValue() { return floating; // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: implicit conversion 'float' -> 'bool' // CHECK-FIXES: return floating != 0.0f; + // CHECK-FIXES-UPPER-CASE: return floating != 0.0F; } void implicitConversionToBoolFromLiterals() { @@ -355,6 +368,7 @@ void implicitConversionToBoolFromUnaryMinusAndZeroLiterals() { functionTaking(-0.0f); // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: implicit conversion 'float' -> 'bool' // CHECK-FIXES: functionTaking((-0.0f) != 0.0f); + // CHECK-FIXES-UPPER-CASE: functionTaking((-0.0f) != 0.0F); functionTaking(-0.0); // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: implicit conversion 'double' -> 'bool' diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 2c6c7e083b9c9..2c29d49ba20f0 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -173,6 +173,13 @@ Non-comprehensive list of changes in this release New Compiler Flags ------------------ +- The ``-fc++-static-destructors={all,thread-local,none}`` flag was + added to control which C++ variables have static destructors + registered: all (the default) does so for all variables, thread-local + only for thread-local variables, and none (which corresponds to the + existing ``-fno-c++-static-destructors`` flag) skips all static + destructors registration. + Deprecated Compiler Flags ------------------------- @@ -309,6 +316,8 @@ Bug Fixes to C++ Support template depth than the friend function template. (#GH98258) - Clang now rebuilds the template parameters of out-of-line declarations and specializations in the context of the current instantiation in all cases. +- Fix evaluation of the index of dependent pack indexing expressions/types specifiers (#GH105900) +- Correctly handle subexpressions of an immediate invocation in the presence of implicit casts. (#GH105558) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst index 0bfbc995579d4..89a1018e14c0e 100644 --- a/clang/docs/analyzer/checkers.rst +++ b/clang/docs/analyzer/checkers.rst @@ -571,7 +571,7 @@ if ((y = make_int())) { nullability ^^^^^^^^^^^ -Objective C checkers that warn for null pointer passing and dereferencing errors. +Checkers (mostly Objective C) that warn for null pointer passing and dereferencing errors. .. _nullability-NullPassedToNonnull: @@ -588,8 +588,8 @@ Warns when a null pointer is passed to a pointer which has a _Nonnull type. .. _nullability-NullReturnedFromNonnull: -nullability.NullReturnedFromNonnull (ObjC) -"""""""""""""""""""""""""""""""""""""""""" +nullability.NullReturnedFromNonnull (C, C++, ObjC) +"""""""""""""""""""""""""""""""""""""""""""""""""" Warns when a null pointer is returned from a function that has _Nonnull return type. .. 
code-block:: objc @@ -604,6 +604,22 @@ Warns when a null pointer is returned from a function that has _Nonnull return t return result; } +Warns when a null pointer is returned from a function annotated with ``__attribute__((returns_nonnull))`` + +.. code-block:: cpp + + int global; + __attribute__((returns_nonnull)) void* getPtr(void* p); + + void* getPtr(void* p) { + if (p) { // forgot to negate the condition + return &global; + } + // Warning: nullptr returned from a function that is expected + // to return a non-null value + return p; + } + .. _nullability-NullableDereferenced: nullability.NullableDereferenced (ObjC) diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index df36a2163b9f0..c2b9d7cb93c30 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -1592,6 +1592,12 @@ succeeds but Clang emits a warning specifying that the function is deprecated. Finally, if Clang is instructed to compile code for macOS 10.7, the call fails because ``f()`` is no longer available. +Clang is instructed to compile code for a minimum deployment version using +the ``-target`` or ``-mtargetos`` command line arguments. For example, +macOS 10.7 would be specified as ``-target x86_64-apple-macos10.7`` or +``-mtargetos=macos10.7``. Variants like Mac Catalyst are specified as +``-target arm64-apple-ios15.0-macabi`` or ``-mtargetos=ios15.0-macabi`` + The availability attribute is a comma-separated list starting with the platform name and then including clauses specifying important milestones in the declaration's lifetime (in any order) along with additional information. Those @@ -1636,41 +1642,61 @@ the implicitly inferred availability attributes. If no availability attribute specifies availability for the current target platform, the availability attributes are ignored. Supported platforms are: -``ios`` - Apple's iOS operating system. The minimum deployment target is specified - as part of the ``-target *arch*-apple-ios*version*`` command line argument. - Alternatively, it can be specified by the ``-mtargetos=ios*version*`` - command-line argument. +``iOS`` +``macOS`` +``tvOS`` +``watchOS`` +``iOSApplicationExtension`` +``macOSApplicationExtension`` +``tvOSApplicationExtension`` +``watchOSApplicationExtension`` +``macCatalyst`` +``macCatalystApplicationExtension`` +``visionOS`` +``visionOSApplicationExtension`` +``driverkit`` +``swift`` +``android`` +``fuchsia`` +``ohos`` +``zos`` +``ShaderModel`` -``macos`` - Apple's macOS operating system. The minimum deployment target is specified - as part of the ``-target *arch*-apple-macos*version*`` command line argument. - Alternatively, it can be specified by the ``-mtargetos=macos*version*`` - command-line argument. ``macosx`` is supported for - backward-compatibility reasons, but it is deprecated. +Some platforms have alias names: +``ios`` +``macos`` +``macosx (deprecated)`` ``tvos`` - Apple's tvOS operating system. The minimum deployment target is specified - as part of the ``-target *arch*-apple-tvos*version*`` command line argument. - Alternatively, it can be specified by the ``-mtargetos=tvos*version*`` - command-line argument. - ``watchos`` - Apple's watchOS operating system. The minimum deployment target is specified - as part of the ``-target *arch*-apple-watchos*version*`` command line argument. - Alternatively, it can be specified by the ``-mtargetos=watchos*version*`` - command-line argument. 
- +``ios_app_extension`` +``macos_app_extension`` +``macosx_app_extension (deprecated)`` +``tvos_app_extension`` +``watchos_app_extension`` +``maccatalyst`` +``maccatalyst_app_extension`` ``visionos`` - Apple's visionOS operating system. The minimum deployment target is specified - as part of the ``-target *arch*-apple-visionos*version*`` command line argument. - Alternatively, it can be specified by the ``-mtargetos=visionos*version*`` - command-line argument. - -``driverkit`` - Apple's DriverKit userspace kernel extensions. The minimum deployment target - is specified as part of the ``-target *arch*-apple-driverkit*version*`` - command line argument. +``visionos_app_extension`` +``shadermodel`` + +Supported environment names for the ShaderModel platform: + +``pixel`` +``vertex`` +``geometry`` +``hull`` +``domain`` +``compute`` +``raygeneration`` +``intersection`` +``anyhit`` +``closesthit`` +``miss`` +``callable`` +``mesh`` +``amplification`` +``library`` A declaration can typically be used even when deploying back to a platform version prior to when the declaration was introduced. When this happens, the diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 956d9a2d2434c..fd3346d29f26a 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -464,7 +464,9 @@ LANGOPT(FixedPoint, 1, 0, "fixed point types") LANGOPT(PaddingOnUnsignedFixedPoint, 1, 0, "unsigned fixed point types having one extra padding bit") -LANGOPT(RegisterStaticDestructors, 1, 1, "Register C++ static destructors") +ENUM_LANGOPT(RegisterStaticDestructors, RegisterStaticDestructorsKind, 2, + RegisterStaticDestructorsKind::All, + "Register C++ static destructors") LANGOPT(RegCall4, 1, 0, "Set __regcall4 as a default calling convention to respect __regcall ABI v.4") diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 1c80ee89837cb..51a34686ad7e1 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -458,6 +458,16 @@ class LangOptionsBase { CX_None }; + /// Controls which variables have static destructors registered. + enum class RegisterStaticDestructorsKind { + /// Register static destructors for all variables. + All, + /// Register static destructors only for thread-local variables. + ThreadLocal, + /// Don't register static destructors for any variables. + None, + }; + // Define simple language options (with no accessors). #define LANGOPT(Name, Bits, Default, Description) unsigned Name : Bits; #define ENUM_LANGOPT(Name, Type, Bits, Default, Description) diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def index d683106bb0e29..212c1f6ff3a12 100644 --- a/clang/include/clang/Basic/TokenKinds.def +++ b/clang/include/clang/Basic/TokenKinds.def @@ -660,6 +660,9 @@ KEYWORD(out , KEYHLSL) #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) KEYWORD(Name, KEYHLSL) #include "clang/Basic/HLSLIntangibleTypes.def" +// HLSL Type traits. 
+TYPE_TRAIT_2(__builtin_hlsl_is_scalarized_layout_compatible, IsScalarizedLayoutCompatible, KEYHLSL) + // OpenMP Type Traits UNARY_EXPR_OR_TYPE_TRAIT(__builtin_omp_required_simd_align, OpenMPRequiredSimdAlign, KEYALL) diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h index 84eadd42880a5..9177d56718ee7 100644 --- a/clang/include/clang/Driver/Driver.h +++ b/clang/include/clang/Driver/Driver.h @@ -379,8 +379,7 @@ class Driver { /// Takes the path to a binary that's either in bin/ or lib/ and returns /// the path to clang's resource directory. - static std::string GetResourcesPath(StringRef BinaryPath, - StringRef CustomResourceDir = ""); + static std::string GetResourcesPath(StringRef BinaryPath); Driver(StringRef ClangExecutable, StringRef TargetTriple, DiagnosticsEngine &Diags, std::string Title = "clang LLVM compiler", diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 1b9b3f2c6600a..83cf753e82484 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2302,11 +2302,18 @@ defm fixed_point : BoolFOption<"fixed-point", PosFlag, NegFlag, BothFlags<[], [ClangOption], " fixed point types">>; -defm cxx_static_destructors : BoolFOption<"c++-static-destructors", - LangOpts<"RegisterStaticDestructors">, DefaultTrue, - NegFlag, - PosFlag>; +def cxx_static_destructors_EQ : Joined<["-"], "fc++-static-destructors=">, Group, + HelpText<"Controls which variables C++ static destructors are registered for">, + Values<"all,thread-local,none">, + NormalizedValues<["All", "ThreadLocal", "None"]>, + NormalizedValuesScope<"LangOptions::RegisterStaticDestructorsKind">, + MarshallingInfoEnum, "All">, + Visibility<[ClangOption, CC1Option]>; +def cxx_static_destructors : Flag<["-"], "fc++-static-destructors">, Group, + Alias, AliasArgs<["all"]>; +def no_cxx_static_destructors : Flag<["-"], "fno-c++-static-destructors">, Group, + Alias, AliasArgs<["none"]>, + HelpText<"Disable C++ static destructor registration">; def fsymbol_partition_EQ : Joined<["-"], "fsymbol-partition=">, Group, Visibility<[ClangOption, CC1Option]>, MarshallingInfoString>; diff --git a/clang/include/clang/ExtractAPI/DeclarationFragments.h b/clang/include/clang/ExtractAPI/DeclarationFragments.h index 535da90b98284..4ac744459031e 100644 --- a/clang/include/clang/ExtractAPI/DeclarationFragments.h +++ b/clang/include/clang/ExtractAPI/DeclarationFragments.h @@ -411,9 +411,9 @@ class DeclarationFragmentsBuilder { /// Build DeclarationFragments for a macro. /// /// \param Name name of the macro. - /// \param MD the associated MacroDirective. + /// \param MI the associated MacroInfo. static DeclarationFragments getFragmentsForMacro(StringRef Name, - const MacroDirective *MD); + const MacroInfo *MI); /// Build DeclarationFragments for a typedef \p TypedefNameDecl. 
static DeclarationFragments diff --git a/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h b/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h index 67659f5a25037..b09b8b44d9aba 100644 --- a/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h +++ b/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h @@ -213,7 +213,7 @@ class ExtractAPIVisitorBase : public RecursiveASTVisitor { StringRef getOwningModuleName(const Decl &D) { if (auto *OwningModule = D.getImportedOwningModule()) - return OwningModule->Name; + return OwningModule->getTopLevelModule()->Name; return {}; } diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h index 3aae3383c215b..5277fb57a2334 100644 --- a/clang/include/clang/Sema/SemaHLSL.h +++ b/clang/include/clang/Sema/SemaHLSL.h @@ -61,6 +61,9 @@ class SemaHLSL : public SemaBase { void handleParamModifierAttr(Decl *D, const ParsedAttr &AL); bool CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall); + + // HLSL Type trait implementations + bool IsScalarizedLayoutCompatible(QualType T1, QualType T2) const; }; } // namespace clang diff --git a/clang/lib/AST/ByteCode/EvalEmitter.cpp b/clang/lib/AST/ByteCode/EvalEmitter.cpp index 53ec8f52d4921..3b9e5f9f9f69c 100644 --- a/clang/lib/AST/ByteCode/EvalEmitter.cpp +++ b/clang/lib/AST/ByteCode/EvalEmitter.cpp @@ -219,7 +219,7 @@ bool EvalEmitter::emitRetValue(const SourceInfo &Info) { return false; if (std::optional APV = - Ptr.toRValue(S.getCtx(), EvalResult.getSourceType())) { + Ptr.toRValue(S.getASTContext(), EvalResult.getSourceType())) { EvalResult.setValue(*APV); return true; } diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index aea303f0e630c..09d3f4525138e 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -326,7 +326,7 @@ bool CheckConstant(InterpState &S, CodePtr OpPC, const Descriptor *Desc) { auto IsConstType = [&S](const VarDecl *VD) -> bool { QualType T = VD->getType(); - if (T.isConstant(S.getCtx())) + if (T.isConstant(S.getASTContext())) return true; if (S.getLangOpts().CPlusPlus && !S.getLangOpts().CPlusPlus11) @@ -523,9 +523,9 @@ bool CheckGlobalInitialized(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { assert(S.getLangOpts().CPlusPlus); const auto *VD = cast(Ptr.getDeclDesc()->asValueDecl()); if ((!VD->hasConstantInitialization() && - VD->mightBeUsableInConstantExpressions(S.getCtx())) || + VD->mightBeUsableInConstantExpressions(S.getASTContext())) || (S.getLangOpts().OpenCL && !S.getLangOpts().CPlusPlus11 && - !VD->hasICEInitializer(S.getCtx()))) { + !VD->hasICEInitializer(S.getASTContext()))) { const SourceInfo &Loc = S.Current->getSource(OpPC); S.FFDiag(Loc, diag::note_constexpr_var_init_non_constant, 1) << VD; S.Note(VD->getLocation(), diag::note_declared_at); @@ -797,7 +797,7 @@ bool CheckNewDeleteForms(InterpState &S, CodePtr OpPC, bool NewWasArray, // but we want to get the array size right. if (D->isArray()) { QualType ElemQT = D->getType()->getPointeeType(); - TypeToDiagnose = S.getCtx().getConstantArrayType( + TypeToDiagnose = S.getASTContext().getConstantArrayType( ElemQT, APInt(64, static_cast(D->getNumElems()), false), nullptr, ArraySizeModifier::Normal, 0); } else @@ -819,7 +819,7 @@ bool CheckDeleteSource(InterpState &S, CodePtr OpPC, const Expr *Source, // Whatever this is, we didn't heap allocate it. 
const SourceInfo &Loc = S.Current->getSource(OpPC); S.FFDiag(Loc, diag::note_constexpr_delete_not_heap_alloc) - << Ptr.toDiagnosticString(S.getCtx()); + << Ptr.toDiagnosticString(S.getASTContext()); if (Ptr.isTemporary()) S.Note(Ptr.getDeclLoc(), diag::note_constexpr_temporary_here); diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index 81c547991c3d7..242532a3f0544 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -41,7 +41,7 @@ using APSInt = llvm::APSInt; /// Convert a value to an APValue. template bool ReturnValue(const InterpState &S, const T &V, APValue &R) { - R = V.toAPValue(S.getCtx()); + R = V.toAPValue(S.getASTContext()); return true; } @@ -231,12 +231,12 @@ bool CheckArraySize(InterpState &S, CodePtr OpPC, SizeT *NumElements, // constructing the array, we catch this here. SizeT MaxElements = SizeT::from(Descriptor::MaxArrayElemBytes / ElemSize); if (NumElements->toAPSInt().getActiveBits() > - ConstantArrayType::getMaxSizeBits(S.getCtx()) || + ConstantArrayType::getMaxSizeBits(S.getASTContext()) || *NumElements > MaxElements) { if (!IsNoThrow) { const SourceInfo &Loc = S.Current->getSource(OpPC); S.FFDiag(Loc, diag::note_constexpr_new_too_large) - << NumElements->toDiagnosticString(S.getCtx()); + << NumElements->toDiagnosticString(S.getASTContext()); } return false; } @@ -911,8 +911,8 @@ inline bool CmpHelper(InterpState &S, CodePtr OpPC, const SourceInfo &Loc = S.Current->getSource(OpPC); S.FFDiag(Loc, diag::note_constexpr_pointer_comparison_unspecified) - << LHS.toDiagnosticString(S.getCtx()) - << RHS.toDiagnosticString(S.getCtx()); + << LHS.toDiagnosticString(S.getASTContext()) + << RHS.toDiagnosticString(S.getASTContext()); return false; } @@ -927,7 +927,7 @@ inline bool CmpHelperEQ(InterpState &S, CodePtr OpPC, if (FP.isWeak()) { const SourceInfo &Loc = S.Current->getSource(OpPC); S.FFDiag(Loc, diag::note_constexpr_pointer_weak_comparison) - << FP.toDiagnosticString(S.getCtx()); + << FP.toDiagnosticString(S.getASTContext()); return false; } } @@ -945,8 +945,8 @@ inline bool CmpHelper(InterpState &S, CodePtr OpPC, CompareFn Fn) { if (!Pointer::hasSameBase(LHS, RHS)) { const SourceInfo &Loc = S.Current->getSource(OpPC); S.FFDiag(Loc, diag::note_constexpr_pointer_comparison_unspecified) - << LHS.toDiagnosticString(S.getCtx()) - << RHS.toDiagnosticString(S.getCtx()); + << LHS.toDiagnosticString(S.getASTContext()) + << RHS.toDiagnosticString(S.getASTContext()); return false; } else { unsigned VL = LHS.getByteOffset(); @@ -974,7 +974,7 @@ inline bool CmpHelperEQ(InterpState &S, CodePtr OpPC, CompareFn Fn) { if (P.isWeak()) { const SourceInfo &Loc = S.Current->getSource(OpPC); S.FFDiag(Loc, diag::note_constexpr_pointer_weak_comparison) - << P.toDiagnosticString(S.getCtx()); + << P.toDiagnosticString(S.getASTContext()); return false; } } @@ -984,13 +984,13 @@ inline bool CmpHelperEQ(InterpState &S, CodePtr OpPC, CompareFn Fn) { RHS.getOffset() == 0) { const SourceInfo &Loc = S.Current->getSource(OpPC); S.FFDiag(Loc, diag::note_constexpr_pointer_comparison_past_end) - << LHS.toDiagnosticString(S.getCtx()); + << LHS.toDiagnosticString(S.getASTContext()); return false; } else if (RHS.isOnePastEnd() && !LHS.isOnePastEnd() && !LHS.isZero() && LHS.getOffset() == 0) { const SourceInfo &Loc = S.Current->getSource(OpPC); S.FFDiag(Loc, diag::note_constexpr_pointer_comparison_past_end) - << RHS.toDiagnosticString(S.getCtx()); + << RHS.toDiagnosticString(S.getASTContext()); return false; } @@ -1073,8 +1073,8 @@ bool 
CMP3(InterpState &S, CodePtr OpPC, const ComparisonCategoryInfo *CmpInfo) { // This should only happen with pointers. const SourceInfo &Loc = S.Current->getSource(OpPC); S.FFDiag(Loc, diag::note_constexpr_pointer_comparison_unspecified) - << LHS.toDiagnosticString(S.getCtx()) - << RHS.toDiagnosticString(S.getCtx()); + << LHS.toDiagnosticString(S.getASTContext()) + << RHS.toDiagnosticString(S.getASTContext()); return false; } @@ -1342,7 +1342,7 @@ bool InitGlobalTemp(InterpState &S, CodePtr OpPC, uint32_t I, const Pointer &Ptr = S.P.getGlobal(I); const T Value = S.Stk.peek(); - APValue APV = Value.toAPValue(S.getCtx()); + APValue APV = Value.toAPValue(S.getASTContext()); APValue *Cached = Temp->getOrCreateValue(true); *Cached = APV; @@ -1369,7 +1369,7 @@ inline bool InitGlobalTempComp(InterpState &S, CodePtr OpPC, std::make_pair(P.getDeclDesc()->asExpr(), Temp)); if (std::optional APV = - P.toRValue(S.getCtx(), Temp->getTemporaryExpr()->getType())) { + P.toRValue(S.getASTContext(), Temp->getTemporaryExpr()->getType())) { *Cached = *APV; return true; } @@ -1404,7 +1404,8 @@ bool InitThisBitField(InterpState &S, CodePtr OpPC, const Record::Field *F, return false; const Pointer &Field = This.atField(FieldOffset); const auto &Value = S.Stk.pop(); - Field.deref() = Value.truncate(F->Decl->getBitWidthValue(S.getCtx())); + Field.deref() = + Value.truncate(F->Decl->getBitWidthValue(S.getASTContext())); Field.initialize(); return true; } @@ -1427,7 +1428,8 @@ bool InitBitField(InterpState &S, CodePtr OpPC, const Record::Field *F) { assert(F->isBitField()); const T &Value = S.Stk.pop(); const Pointer &Field = S.Stk.peek().atField(F->Offset); - Field.deref() = Value.truncate(F->Decl->getBitWidthValue(S.getCtx())); + Field.deref() = + Value.truncate(F->Decl->getBitWidthValue(S.getASTContext())); Field.activate(); Field.initialize(); return true; @@ -1477,7 +1479,7 @@ inline bool GetPtrField(InterpState &S, CodePtr OpPC, uint32_t Off) { return false; if (Ptr.isIntegralPointer()) { - S.Stk.push(Ptr.asIntPointer().atOffset(S.getCtx(), Off)); + S.Stk.push(Ptr.asIntPointer().atOffset(S.getASTContext(), Off)); return true; } @@ -1505,7 +1507,7 @@ inline bool GetPtrFieldPop(InterpState &S, CodePtr OpPC, uint32_t Off) { return false; if (Ptr.isIntegralPointer()) { - S.Stk.push(Ptr.asIntPointer().atOffset(S.getCtx(), Off)); + S.Stk.push(Ptr.asIntPointer().atOffset(S.getASTContext(), Off)); return true; } @@ -1721,7 +1723,7 @@ bool StoreBitField(InterpState &S, CodePtr OpPC) { if (Ptr.canBeInitialized()) Ptr.initialize(); if (const auto *FD = Ptr.getField()) - Ptr.deref() = Value.truncate(FD->getBitWidthValue(S.getCtx())); + Ptr.deref() = Value.truncate(FD->getBitWidthValue(S.getASTContext())); else Ptr.deref() = Value; return true; @@ -1736,7 +1738,7 @@ bool StoreBitFieldPop(InterpState &S, CodePtr OpPC) { if (Ptr.canBeInitialized()) Ptr.initialize(); if (const auto *FD = Ptr.getField()) - Ptr.deref() = Value.truncate(FD->getBitWidthValue(S.getCtx())); + Ptr.deref() = Value.truncate(FD->getBitWidthValue(S.getASTContext())); else Ptr.deref() = Value; return true; @@ -2014,7 +2016,7 @@ inline bool SubPtr(InterpState &S, CodePtr OpPC) { while (auto *AT = dyn_cast(PtrT)) PtrT = AT->getElementType(); - QualType ArrayTy = S.getCtx().getConstantArrayType( + QualType ArrayTy = S.getASTContext().getConstantArrayType( PtrT, APInt::getZero(1), nullptr, ArraySizeModifier::Normal, 0); S.FFDiag(S.Current->getSource(OpPC), diag::note_constexpr_pointer_subtraction_zero_size) @@ -2953,7 +2955,7 @@ inline bool 
CheckDecl(InterpState &S, CodePtr OpPC, const VarDecl *VD) { if (VD == S.EvaluatingDecl) return true; - if (!VD->isUsableInConstantExpressions(S.getCtx())) { + if (!VD->isUsableInConstantExpressions(S.getASTContext())) { S.CCEDiag(VD->getLocation(), diag::note_constexpr_static_local) << (VD->getTSCSpec() == TSCS_unspecified ? 0 : 1) << VD; return false; @@ -3047,7 +3049,7 @@ static inline bool Free(InterpState &S, CodePtr OpPC, bool DeleteIsArrayForm) { if (!Ptr.isRoot() || Ptr.isOnePastEnd() || Ptr.isArrayElement()) { const SourceInfo &Loc = S.Current->getSource(OpPC); S.FFDiag(Loc, diag::note_constexpr_delete_subobject) - << Ptr.toDiagnosticString(S.getCtx()) << Ptr.isOnePastEnd(); + << Ptr.toDiagnosticString(S.getASTContext()) << Ptr.isOnePastEnd(); return false; } diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 26abf58205106..1a71bff25d254 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -38,7 +38,7 @@ static T getParam(const InterpFrame *Frame, unsigned Index) { } PrimType getIntPrimType(const InterpState &S) { - const TargetInfo &TI = S.getCtx().getTargetInfo(); + const TargetInfo &TI = S.getASTContext().getTargetInfo(); unsigned IntWidth = TI.getIntWidth(); if (IntWidth == 32) @@ -49,7 +49,7 @@ PrimType getIntPrimType(const InterpState &S) { } PrimType getLongPrimType(const InterpState &S) { - const TargetInfo &TI = S.getCtx().getTargetInfo(); + const TargetInfo &TI = S.getASTContext().getTargetInfo(); unsigned LongWidth = TI.getLongWidth(); if (LongWidth == 64) @@ -272,10 +272,10 @@ static bool interp__builtin_nan(InterpState &S, CodePtr OpPC, return false; const llvm::fltSemantics &TargetSemantics = - S.getCtx().getFloatTypeSemantics(F->getDecl()->getReturnType()); + S.getASTContext().getFloatTypeSemantics(F->getDecl()->getReturnType()); Floating Result; - if (S.getCtx().getTargetInfo().isNan2008()) { + if (S.getASTContext().getTargetInfo().isNan2008()) { if (Signaling) Result = Floating( llvm::APFloat::getSNaN(TargetSemantics, /*Negative=*/false, &Fill)); @@ -303,7 +303,7 @@ static bool interp__builtin_nan(InterpState &S, CodePtr OpPC, static bool interp__builtin_inf(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const Function *F) { const llvm::fltSemantics &TargetSemantics = - S.getCtx().getFloatTypeSemantics(F->getDecl()->getReturnType()); + S.getASTContext().getFloatTypeSemantics(F->getDecl()->getReturnType()); S.Stk.push(Floating::getInf(TargetSemantics)); return true; @@ -689,8 +689,8 @@ static bool interp__builtin_eh_return_data_regno(InterpState &S, CodePtr OpPC, PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType()); APSInt Arg = peekToAPSInt(S.Stk, ArgT); - int Result = - S.getCtx().getTargetInfo().getEHDataRegisterNumber(Arg.getZExtValue()); + int Result = S.getASTContext().getTargetInfo().getEHDataRegisterNumber( + Arg.getZExtValue()); pushInteger(S, Result, Call->getType()); return true; } @@ -734,7 +734,7 @@ static bool interp__builtin_overflowop(InterpState &S, CodePtr OpPC, ResultType->isSignedIntegerOrEnumerationType(); uint64_t LHSSize = LHS.getBitWidth(); uint64_t RHSSize = RHS.getBitWidth(); - uint64_t ResultSize = S.getCtx().getTypeSize(ResultType); + uint64_t ResultSize = S.getASTContext().getTypeSize(ResultType); uint64_t MaxBits = std::max(std::max(LHSSize, RHSSize), ResultSize); // Add an additional bit if the signedness isn't uniformly agreed to. 
We @@ -794,7 +794,7 @@ static bool interp__builtin_overflowop(InterpState &S, CodePtr OpPC, // since it will give us the behavior of a TruncOrSelf in the case where // its parameter <= its size. We previously set Result to be at least the // type-size of the result, so getTypeSize(ResultType) <= Resu - APSInt Temp = Result.extOrTrunc(S.getCtx().getTypeSize(ResultType)); + APSInt Temp = Result.extOrTrunc(S.getASTContext().getTypeSize(ResultType)); Temp.setIsSigned(ResultType->isSignedIntegerOrEnumerationType()); if (!APSInt::isSameValue(Temp, Result)) @@ -974,8 +974,8 @@ static bool interp__builtin_atomic_lock_free(InterpState &S, CodePtr OpPC, if (Size.isPowerOfTwo()) { // Check against inlining width. unsigned InlineWidthBits = - S.getCtx().getTargetInfo().getMaxAtomicInlineWidth(); - if (Size <= S.getCtx().toCharUnitsFromBits(InlineWidthBits)) { + S.getASTContext().getTargetInfo().getMaxAtomicInlineWidth(); + if (Size <= S.getASTContext().toCharUnitsFromBits(InlineWidthBits)) { // OK, we will inline appropriately-aligned operations of this size, // and _Atomic(T) is appropriately-aligned. @@ -1007,7 +1007,7 @@ static bool interp__builtin_atomic_lock_free(InterpState &S, CodePtr OpPC, if (auto PtrTy = PtrArg->getType()->getAs()) { QualType PointeeType = PtrTy->getPointeeType(); if (!PointeeType->isIncompleteType() && - S.getCtx().getTypeAlignInChars(PointeeType) >= Size) { + S.getASTContext().getTypeAlignInChars(PointeeType) >= Size) { // OK, we will inline operations on this object. return returnBool(true); } @@ -1059,7 +1059,7 @@ static bool interp__builtin_is_aligned_up_down(InterpState &S, CodePtr OpPC, S.FFDiag(Call, diag::note_constexpr_invalid_alignment) << Alignment; return false; } - unsigned SrcWidth = S.getCtx().getIntWidth(Call->getArg(0)->getType()); + unsigned SrcWidth = S.getASTContext().getIntWidth(Call->getArg(0)->getType()); APSInt MaxValue(APInt::getOneBitSet(SrcWidth, SrcWidth - 1)); if (APSInt::compareValues(Alignment, MaxValue) > 0) { S.FFDiag(Call, diag::note_constexpr_alignment_too_big) @@ -1094,7 +1094,7 @@ static bool interp__builtin_is_aligned_up_down(InterpState &S, CodePtr OpPC, unsigned PtrOffset = Ptr.getByteOffset(); PtrOffset = Ptr.getIndex(); CharUnits BaseAlignment = - S.getCtx().getDeclAlign(Ptr.getDeclDesc()->asValueDecl()); + S.getASTContext().getDeclAlign(Ptr.getDeclDesc()->asValueDecl()); CharUnits PtrAlign = BaseAlignment.alignmentAtOffset(CharUnits::fromQuantity(PtrOffset)); @@ -1157,7 +1157,7 @@ static bool interp__builtin_os_log_format_buffer_size(InterpState &S, const Function *Func, const CallExpr *Call) { analyze_os_log::OSLogBufferLayout Layout; - analyze_os_log::computeOSLogBufferLayout(S.getCtx(), Call, Layout); + analyze_os_log::computeOSLogBufferLayout(S.getASTContext(), Call, Layout); pushInteger(S, Layout.size().getQuantity(), Call->getType()); return true; } @@ -1624,10 +1624,11 @@ bool InterpretOffsetOf(InterpState &S, CodePtr OpPC, const OffsetOfExpr *E, const RecordDecl *RD = RT->getDecl(); if (RD->isInvalidDecl()) return false; - const ASTRecordLayout &RL = S.getCtx().getASTRecordLayout(RD); + const ASTRecordLayout &RL = S.getASTContext().getASTRecordLayout(RD); unsigned FieldIndex = MemberDecl->getFieldIndex(); assert(FieldIndex < RL.getFieldCount() && "offsetof field in wrong type"); - Result += S.getCtx().toCharUnitsFromBits(RL.getFieldOffset(FieldIndex)); + Result += + S.getASTContext().toCharUnitsFromBits(RL.getFieldOffset(FieldIndex)); CurrentType = MemberDecl->getType().getNonReferenceType(); break; } @@ -1635,11 +1636,11 
@@ bool InterpretOffsetOf(InterpState &S, CodePtr OpPC, const OffsetOfExpr *E, // When generating bytecode, we put all the index expressions as Sint64 on // the stack. int64_t Index = ArrayIndices[ArrayIndex]; - const ArrayType *AT = S.getCtx().getAsArrayType(CurrentType); + const ArrayType *AT = S.getASTContext().getAsArrayType(CurrentType); if (!AT) return false; CurrentType = AT->getElementType(); - CharUnits ElementSize = S.getCtx().getTypeSizeInChars(CurrentType); + CharUnits ElementSize = S.getASTContext().getTypeSizeInChars(CurrentType); Result += Index * ElementSize; ++ArrayIndex; break; @@ -1656,7 +1657,7 @@ bool InterpretOffsetOf(InterpState &S, CodePtr OpPC, const OffsetOfExpr *E, const RecordDecl *RD = RT->getDecl(); if (RD->isInvalidDecl()) return false; - const ASTRecordLayout &RL = S.getCtx().getASTRecordLayout(RD); + const ASTRecordLayout &RL = S.getASTContext().getASTRecordLayout(RD); // Find the base class itself. CurrentType = BaseSpec->getType(); diff --git a/clang/lib/AST/ByteCode/InterpFrame.cpp b/clang/lib/AST/ByteCode/InterpFrame.cpp index 8b55b61cbbfa7..5e98444ef05a5 100644 --- a/clang/lib/AST/ByteCode/InterpFrame.cpp +++ b/clang/lib/AST/ByteCode/InterpFrame.cpp @@ -179,7 +179,7 @@ void InterpFrame::describe(llvm::raw_ostream &OS) const { if (const auto *MCE = dyn_cast_if_present(CallExpr)) { const Expr *Object = MCE->getImplicitObjectArgument(); Object->printPretty(OS, /*Helper=*/nullptr, - S.getCtx().getPrintingPolicy(), + S.getASTContext().getPrintingPolicy(), /*Indentation=*/0); if (Object->getType()->isPointerType()) OS << "->"; @@ -188,18 +188,18 @@ void InterpFrame::describe(llvm::raw_ostream &OS) const { } else if (const auto *OCE = dyn_cast_if_present(CallExpr)) { OCE->getArg(0)->printPretty(OS, /*Helper=*/nullptr, - S.getCtx().getPrintingPolicy(), + S.getASTContext().getPrintingPolicy(), /*Indentation=*/0); OS << "."; } else if (const auto *M = dyn_cast(F)) { - print(OS, This, S.getCtx(), - S.getCtx().getLValueReferenceType( - S.getCtx().getRecordType(M->getParent()))); + print(OS, This, S.getASTContext(), + S.getASTContext().getLValueReferenceType( + S.getASTContext().getRecordType(M->getParent()))); OS << "."; } } - F->getNameForDiagnostic(OS, S.getCtx().getPrintingPolicy(), + F->getNameForDiagnostic(OS, S.getASTContext().getPrintingPolicy(), /*Qualified=*/false); OS << '('; unsigned Off = 0; @@ -212,7 +212,7 @@ void InterpFrame::describe(llvm::raw_ostream &OS) const { PrimType PrimTy = S.Ctx.classify(Ty).value_or(PT_Ptr); - TYPE_SWITCH(PrimTy, print(OS, stackRef(Off), S.getCtx(), Ty)); + TYPE_SWITCH(PrimTy, print(OS, stackRef(Off), S.getASTContext(), Ty)); Off += align(primSize(PrimTy)); if (I + 1 != N) OS << ", "; diff --git a/clang/lib/AST/ByteCode/InterpState.h b/clang/lib/AST/ByteCode/InterpState.h index 61ee54331c65d..961ba5f5c28a0 100644 --- a/clang/lib/AST/ByteCode/InterpState.h +++ b/clang/lib/AST/ByteCode/InterpState.h @@ -59,7 +59,7 @@ class InterpState final : public State, public SourceMapper { Expr::EvalStatus &getEvalStatus() const override { return Parent.getEvalStatus(); } - ASTContext &getCtx() const override { return Parent.getCtx(); } + ASTContext &getASTContext() const override { return Parent.getASTContext(); } // Forward status checks and updates to the walker. 
bool checkingForUndefinedBehavior() const override { diff --git a/clang/lib/AST/ByteCode/State.cpp b/clang/lib/AST/ByteCode/State.cpp index 0d9dadec4b958..b4db86e8d22c7 100644 --- a/clang/lib/AST/ByteCode/State.cpp +++ b/clang/lib/AST/ByteCode/State.cpp @@ -74,12 +74,12 @@ void State::addNotes(ArrayRef Diags) { } DiagnosticBuilder State::report(SourceLocation Loc, diag::kind DiagId) { - return getCtx().getDiagnostics().Report(Loc, DiagId); + return getASTContext().getDiagnostics().Report(Loc, DiagId); } /// Add a diagnostic to the diagnostics list. PartialDiagnostic &State::addDiag(SourceLocation Loc, diag::kind DiagId) { - PartialDiagnostic PD(DiagId, getCtx().getDiagAllocator()); + PartialDiagnostic PD(DiagId, getASTContext().getDiagAllocator()); getEvalStatus().Diag->push_back(std::make_pair(Loc, PD)); return getEvalStatus().Diag->back().second; } @@ -93,7 +93,8 @@ OptionalDiagnostic State::diag(SourceLocation Loc, diag::kind DiagId, } unsigned CallStackNotes = getCallStackDepth() - 1; - unsigned Limit = getCtx().getDiagnostics().getConstexprBacktraceLimit(); + unsigned Limit = + getASTContext().getDiagnostics().getConstexprBacktraceLimit(); if (Limit) CallStackNotes = std::min(CallStackNotes, Limit + 1); if (checkingPotentialConstantExpression()) @@ -113,7 +114,9 @@ OptionalDiagnostic State::diag(SourceLocation Loc, diag::kind DiagId, return OptionalDiagnostic(); } -const LangOptions &State::getLangOpts() const { return getCtx().getLangOpts(); } +const LangOptions &State::getLangOpts() const { + return getASTContext().getLangOpts(); +} void State::addCallStack(unsigned Limit) { // Determine which calls to skip, if any. diff --git a/clang/lib/AST/ByteCode/State.h b/clang/lib/AST/ByteCode/State.h index 44d6c037c5ad9..2cffce4bc2ae4 100644 --- a/clang/lib/AST/ByteCode/State.h +++ b/clang/lib/AST/ByteCode/State.h @@ -67,7 +67,7 @@ class State { virtual void setActiveDiagnostic(bool Flag) = 0; virtual void setFoldFailureDiagnostic(bool Flag) = 0; virtual Expr::EvalStatus &getEvalStatus() const = 0; - virtual ASTContext &getCtx() const = 0; + virtual ASTContext &getASTContext() const = 0; virtual bool hasPriorDiagnostic() = 0; virtual unsigned getCallStackDepth() = 0; diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 90caf81757ac9..1a07125815832 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -2799,9 +2799,17 @@ bool VarDecl::isKnownToBeDefined() const { } bool VarDecl::isNoDestroy(const ASTContext &Ctx) const { - return hasGlobalStorage() && (hasAttr() || - (!Ctx.getLangOpts().RegisterStaticDestructors && - !hasAttr())); + if (!hasGlobalStorage()) + return false; + if (hasAttr()) + return true; + if (hasAttr()) + return false; + + using RSDKind = LangOptions::RegisterStaticDestructorsKind; + RSDKind K = Ctx.getLangOpts().getRegisterStaticDestructors(); + return K == RSDKind::None || + (K == RSDKind::ThreadLocal && getTLSKind() == TLS_None); } QualType::DestructionKind diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 826cc5f58bdf5..d46f57521a97d 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -1030,7 +1030,7 @@ namespace { discardCleanups(); } - ASTContext &getCtx() const override { return Ctx; } + ASTContext &getASTContext() const override { return Ctx; } void setEvaluatingDecl(APValue::LValueBase Base, APValue &Value, EvaluatingDeclKind EDK = EvaluatingDeclKind::Ctor) { @@ -2327,9 +2327,9 @@ static bool CheckLValueConstantExpression(EvalInfo &Info, SourceLocation Loc, // In CUDA/HIP 
device compilation, only device side variables have // constant addresses. - if (Info.getCtx().getLangOpts().CUDA && - Info.getCtx().getLangOpts().CUDAIsDevice && - Info.getCtx().CUDAConstantEvalCtx.NoWrongSidedVars) { + if (Info.getASTContext().getLangOpts().CUDA && + Info.getASTContext().getLangOpts().CUDAIsDevice && + Info.getASTContext().CUDAConstantEvalCtx.NoWrongSidedVars) { if ((!Var->hasAttr() && !Var->hasAttr() && !Var->getType()->isCUDADeviceBuiltinSurfaceType() && @@ -5662,7 +5662,7 @@ static EvalStmtResult EvaluateStmt(StmtResult &Result, EvalInfo &Info, *Info.CurrentCall, hasSpecificAttr(AS->getAttrs()) && isa(SS)); - auto LO = Info.getCtx().getLangOpts(); + auto LO = Info.getASTContext().getLangOpts(); if (LO.CXXAssumptions && !LO.MSVCCompat) { for (auto *Attr : AS->getAttrs()) { auto *AA = dyn_cast(Attr); @@ -5673,7 +5673,7 @@ static EvalStmtResult EvaluateStmt(StmtResult &Result, EvalInfo &Info, if (Assumption->isValueDependent()) return ESR_Failed; - if (Assumption->HasSideEffects(Info.getCtx())) + if (Assumption->HasSideEffects(Info.getASTContext())) continue; bool Value; diff --git a/clang/lib/CodeGen/CGBuilder.h b/clang/lib/CodeGen/CGBuilder.h index 08730a6a6672a..b8036cf6e6a30 100644 --- a/clang/lib/CodeGen/CGBuilder.h +++ b/clang/lib/CodeGen/CGBuilder.h @@ -14,6 +14,7 @@ #include "CodeGenTypeCache.h" #include "llvm/Analysis/Utils/Local.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/GEPNoWrapFlags.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Type.h" @@ -334,9 +335,10 @@ class CGBuilderTy : public CGBuilderBaseTy { Address CreateGEP(Address Addr, ArrayRef IdxList, llvm::Type *ElementType, CharUnits Align, - const Twine &Name = "") { + const Twine &Name = "", + llvm::GEPNoWrapFlags NW = llvm::GEPNoWrapFlags::none()) { llvm::Value *Ptr = emitRawPointerFromAddress(Addr); - return RawAddress(CreateGEP(Addr.getElementType(), Ptr, IdxList, Name), + return RawAddress(CreateGEP(Addr.getElementType(), Ptr, IdxList, Name, NW), ElementType, Align); } diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index af11bc20a3b63..5b55ec9d8064e 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -36,6 +36,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/FixedPointBuilder.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GEPNoWrapFlags.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Intrinsics.h" @@ -5756,7 +5757,12 @@ CodeGenFunction::EmitCheckedInBoundsGEP(llvm::Type *ElemTy, Value *Ptr, bool SignedIndices, bool IsSubtraction, SourceLocation Loc, const Twine &Name) { llvm::Type *PtrTy = Ptr->getType(); - Value *GEPVal = Builder.CreateInBoundsGEP(ElemTy, Ptr, IdxList, Name); + + llvm::GEPNoWrapFlags NWFlags = llvm::GEPNoWrapFlags::inBounds(); + if (!SignedIndices && !IsSubtraction) + NWFlags |= llvm::GEPNoWrapFlags::noUnsignedWrap(); + + Value *GEPVal = Builder.CreateGEP(ElemTy, Ptr, IdxList, Name, NWFlags); // If the pointer overflow sanitizer isn't enabled, do nothing. 
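GEPNoWrapFlags values compose, so a call site can start from inBounds() and add nuw only when the index is provably non-negative, which is what both EmitCheckedInBoundsGEP overloads now do. A condensed sketch of the pattern:

// nuw is only sound when the index is unsigned and we are not lowering a
// pointer subtraction; otherwise keep plain inbounds.
llvm::GEPNoWrapFlags NW = llvm::GEPNoWrapFlags::inBounds();
if (!SignedIndices && !IsSubtraction)
  NW |= llvm::GEPNoWrapFlags::noUnsignedWrap();
llvm::Value *GEPVal = Builder.CreateGEP(ElemTy, Ptr, IdxList, "arrayidx", NW);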
if (!SanOpts.has(SanitizerKind::PointerOverflow)) @@ -5871,8 +5877,13 @@ Address CodeGenFunction::EmitCheckedInBoundsGEP( Address Addr, ArrayRef IdxList, llvm::Type *elementType, bool SignedIndices, bool IsSubtraction, SourceLocation Loc, CharUnits Align, const Twine &Name) { - if (!SanOpts.has(SanitizerKind::PointerOverflow)) - return Builder.CreateInBoundsGEP(Addr, IdxList, elementType, Align, Name); + if (!SanOpts.has(SanitizerKind::PointerOverflow)) { + llvm::GEPNoWrapFlags NWFlags = llvm::GEPNoWrapFlags::inBounds(); + if (!SignedIndices && !IsSubtraction) + NWFlags |= llvm::GEPNoWrapFlags::noUnsignedWrap(); + + return Builder.CreateGEP(Addr, IdxList, elementType, Align, Name, NWFlags); + } return RawAddress( EmitCheckedInBoundsGEP(Addr.getElementType(), Addr.emitRawPointer(*this), diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index e12416e51f8d2..43002add33774 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -171,18 +171,18 @@ getHIPOffloadTargetTriple(const Driver &D, const ArgList &Args) { } // static -std::string Driver::GetResourcesPath(StringRef BinaryPath, - StringRef CustomResourceDir) { +std::string Driver::GetResourcesPath(StringRef BinaryPath) { // Since the resource directory is embedded in the module hash, it's important // that all places that need it call this function, so that they get the // exact same string ("a/../b/" and "b/" get different hashes, for example). // Dir is bin/ or lib/, depending on where BinaryPath is. - std::string Dir = std::string(llvm::sys::path::parent_path(BinaryPath)); - + StringRef Dir = llvm::sys::path::parent_path(BinaryPath); SmallString<128> P(Dir); - if (CustomResourceDir != "") { - llvm::sys::path::append(P, CustomResourceDir); + + StringRef ConfiguredResourceDir(CLANG_RESOURCE_DIR); + if (!ConfiguredResourceDir.empty()) { + llvm::sys::path::append(P, ConfiguredResourceDir); } else { // On Windows, libclang.dll is in bin/. // On non-Windows, libclang.so/.dylib is in lib/. @@ -239,7 +239,7 @@ Driver::Driver(StringRef ClangExecutable, StringRef TargetTriple, #endif // Compute the path to the resource directory. 
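With the second parameter gone, the configure-time CLANG_RESOURCE_DIR override is applied inside GetResourcesPath itself, so every caller collapses to the one-argument form; the Driver constructor and CompilerInvocation hunks further down do exactly this. A sketch of a call site (the executable-path variable is a placeholder):

// Resolve the resource directory relative to the running binary; any
// CLANG_RESOURCE_DIR override is now handled internally.
std::string ResourceDirPath =
    clang::driver::Driver::GetResourcesPath(ClangExecutablePath);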
- ResourceDir = GetResourcesPath(ClangExecutable, CLANG_RESOURCE_DIR); + ResourceDir = GetResourcesPath(ClangExecutable); } void Driver::setDriverMode(StringRef Value) { diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 9f1d57f43b656..df86941950e46 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -7972,8 +7972,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_fno_keep_persistent_storage_variables); Args.addOptInFlag(CmdArgs, options::OPT_fcomplete_member_pointers, options::OPT_fno_complete_member_pointers); - Args.addOptOutFlag(CmdArgs, options::OPT_fcxx_static_destructors, - options::OPT_fno_cxx_static_destructors); + if (Arg *A = Args.getLastArg(options::OPT_cxx_static_destructors_EQ)) + A->render(Args, CmdArgs); addMachineOutlinerArgs(D, Args, CmdArgs, Triple, /*IsLTO=*/false); diff --git a/clang/lib/ExtractAPI/DeclarationFragments.cpp b/clang/lib/ExtractAPI/DeclarationFragments.cpp index 6b85c7db90349..d77bb1d424f7c 100644 --- a/clang/lib/ExtractAPI/DeclarationFragments.cpp +++ b/clang/lib/ExtractAPI/DeclarationFragments.cpp @@ -1327,14 +1327,12 @@ DeclarationFragmentsBuilder::getFragmentsForFunctionTemplateSpecialization( DeclarationFragments DeclarationFragmentsBuilder::getFragmentsForMacro(StringRef Name, - const MacroDirective *MD) { + const MacroInfo *MI) { DeclarationFragments Fragments; Fragments.append("#define", DeclarationFragments::FragmentKind::Keyword) .appendSpace(); Fragments.append(Name, DeclarationFragments::FragmentKind::Identifier); - auto *MI = MD->getMacroInfo(); - if (MI->isFunctionLike()) { Fragments.append("(", DeclarationFragments::FragmentKind::Text); unsigned numParameters = MI->getNumParams(); diff --git a/clang/lib/ExtractAPI/ExtractAPIConsumer.cpp b/clang/lib/ExtractAPI/ExtractAPIConsumer.cpp index d6335854cbf26..0adc23280fd6c 100644 --- a/clang/lib/ExtractAPI/ExtractAPIConsumer.cpp +++ b/clang/lib/ExtractAPI/ExtractAPIConsumer.cpp @@ -286,78 +286,59 @@ class MacroCallback : public PPCallbacks { MacroCallback(const SourceManager &SM, APISet &API, Preprocessor &PP) : SM(SM), API(API), PP(PP) {} - void MacroDefined(const Token &MacroNameToken, - const MacroDirective *MD) override { - auto *MacroInfo = MD->getMacroInfo(); + void EndOfMainFile() override { + for (const auto &M : PP.macros()) { + auto *II = M.getFirst(); + auto MD = PP.getMacroDefinition(II); + auto *MI = MD.getMacroInfo(); - if (MacroInfo->isBuiltinMacro()) - return; + if (!MI) + continue; - auto SourceLoc = MacroNameToken.getLocation(); - if (SM.isWrittenInBuiltinFile(SourceLoc) || - SM.isWrittenInCommandLineFile(SourceLoc)) - return; + // Ignore header guard macros + if (MI->isUsedForHeaderGuard()) + continue; - PendingMacros.emplace_back(MacroNameToken, MD); - } + // Ignore builtin macros and ones defined via the command line. + if (MI->isBuiltinMacro()) + continue; - // If a macro gets undefined at some point during preprocessing of the inputs - // it means that it isn't an exposed API and we should therefore not add a - // macro definition for it. - void MacroUndefined(const Token &MacroNameToken, const MacroDefinition &MD, - const MacroDirective *Undef) override { - // If this macro wasn't previously defined we don't need to do anything - // here. 
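Rather than recording MacroDefined/MacroUndefined events and reconciling them later, the ExtractAPI callback now asks the Preprocessor for whatever is still defined once the main file ends. A condensed sketch of that walk (module-ownership and location filtering elided):

// EndOfMainFile(): enumerate surviving macro definitions directly.
for (const auto &M : PP.macros()) {
  const IdentifierInfo *II = M.getFirst();
  MacroDefinition MD = PP.getMacroDefinition(II);
  const MacroInfo *MI = MD.getMacroInfo();
  if (!MI || MI->isUsedForHeaderGuard() || MI->isBuiltinMacro())
    continue; // undefined again, a header guard, or a builtin
  SourceLocation DefLoc = MI->getDefinitionLoc();
  // ... skip builtin/command-line locations, then record II->getName() and MI.
}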
- if (!Undef) - return; - - llvm::erase_if(PendingMacros, [&MD, this](const PendingMacro &PM) { - return MD.getMacroInfo()->isIdenticalTo(*PM.MD->getMacroInfo(), PP, - /*Syntactically*/ false); - }); - } + auto DefLoc = MI->getDefinitionLoc(); - void EndOfMainFile() override { - for (auto &PM : PendingMacros) { - // `isUsedForHeaderGuard` is only set when the preprocessor leaves the - // file so check for it here. - if (PM.MD->getMacroInfo()->isUsedForHeaderGuard()) + if (SM.isWrittenInBuiltinFile(DefLoc) || + SM.isWrittenInCommandLineFile(DefLoc)) continue; - if (!shouldMacroBeIncluded(PM)) + auto AssociatedModuleMacros = MD.getModuleMacros(); + StringRef OwningModuleName; + if (!AssociatedModuleMacros.empty()) + OwningModuleName = AssociatedModuleMacros.back() + ->getOwningModule() + ->getTopLevelModuleName(); + + if (!shouldMacroBeIncluded(DefLoc, OwningModuleName)) continue; - StringRef Name = PM.MacroNameToken.getIdentifierInfo()->getName(); - PresumedLoc Loc = SM.getPresumedLoc(PM.MacroNameToken.getLocation()); + StringRef Name = II->getName(); + PresumedLoc Loc = SM.getPresumedLoc(DefLoc); SmallString<128> USR; - index::generateUSRForMacro(Name, PM.MacroNameToken.getLocation(), SM, - USR); - + index::generateUSRForMacro(Name, DefLoc, SM, USR); API.createRecord( USR, Name, SymbolReference(), Loc, - DeclarationFragmentsBuilder::getFragmentsForMacro(Name, PM.MD), + DeclarationFragmentsBuilder::getFragmentsForMacro(Name, MI), DeclarationFragmentsBuilder::getSubHeadingForMacro(Name), - SM.isInSystemHeader(PM.MacroNameToken.getLocation())); + SM.isInSystemHeader(DefLoc)); } - - PendingMacros.clear(); } -protected: - struct PendingMacro { - Token MacroNameToken; - const MacroDirective *MD; - - PendingMacro(const Token &MacroNameToken, const MacroDirective *MD) - : MacroNameToken(MacroNameToken), MD(MD) {} - }; - - virtual bool shouldMacroBeIncluded(const PendingMacro &PM) { return true; } + virtual bool shouldMacroBeIncluded(const SourceLocation &MacroLoc, + StringRef ModuleName) { + return true; + } const SourceManager &SM; APISet &API; Preprocessor &PP; - llvm::SmallVector PendingMacros; }; class APIMacroCallback : public MacroCallback { @@ -366,9 +347,10 @@ class APIMacroCallback : public MacroCallback { LocationFileChecker &LCF) : MacroCallback(SM, API, PP), LCF(LCF) {} - bool shouldMacroBeIncluded(const PendingMacro &PM) override { + bool shouldMacroBeIncluded(const SourceLocation &MacroLoc, + StringRef ModuleName) override { // Do not include macros from external files - return LCF(PM.MacroNameToken.getLocation()); + return LCF(MacroLoc) || API.ProductName == ModuleName; } private: diff --git a/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp b/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp index 1bce9c59b1979..030509d378759 100644 --- a/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp +++ b/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp @@ -928,8 +928,8 @@ bool SymbolGraphSerializer::traverseObjCCategoryRecord( return true; auto *CurrentModule = ModuleForCurrentSymbol; - if (Record->isExtendingExternalModule()) - ModuleForCurrentSymbol = &ExtendedModules[Record->Interface.Source]; + if (auto ModuleExtendedByRecord = Record->getExtendedExternalModule()) + ModuleForCurrentSymbol = &ExtendedModules[*ModuleExtendedByRecord]; if (!walkUpFromObjCCategoryRecord(Record)) return false; diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 0bb4175dd021e..32628c5e84332 100644 --- 
a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3130,7 +3130,7 @@ std::string CompilerInvocation::GetResourcesPath(const char *Argv0, void *MainAddr) { std::string ClangExecutable = llvm::sys::fs::getMainExecutable(Argv0, MainAddr); - return Driver::GetResourcesPath(ClangExecutable, CLANG_RESOURCE_DIR); + return Driver::GetResourcesPath(ClangExecutable); } static void GenerateHeaderSearchArgs(const HeaderSearchOptions &Opts, diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index ea57316ad8014..95f53dfefbcc5 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -17463,11 +17463,22 @@ static void RemoveNestedImmediateInvocation( ExprResult TransformInitializer(Expr *Init, bool NotCopyInit) { if (!Init) return Init; + + // We cannot use IgnoreImpCasts because we need to preserve + // full expressions. + while (true) { + if (auto *ICE = dyn_cast(Init)) + Init = ICE->getSubExpr(); + else if (auto *ICE = dyn_cast(Init)) + Init = ICE->getSubExpr(); + else + break; + } /// ConstantExpr are the first layer of implicit node to be removed so if /// Init isn't a ConstantExpr, no ConstantExpr will be skipped. - if (auto *CE = dyn_cast(Init)) - if (CE->isImmediateInvocation()) - RemoveImmediateInvocation(CE); + if (auto *CE = dyn_cast(Init); + CE && CE->isImmediateInvocation()) + RemoveImmediateInvocation(CE); return Base::TransformInitializer(Init, NotCopyInit); } ExprResult TransformDeclRefExpr(DeclRefExpr *E) { diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 746c67ff1e979..d8719ab26cc83 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -39,6 +39,7 @@ #include "clang/Sema/Scope.h" #include "clang/Sema/ScopeInfo.h" #include "clang/Sema/SemaCUDA.h" +#include "clang/Sema/SemaHLSL.h" #include "clang/Sema/SemaInternal.h" #include "clang/Sema/SemaLambda.h" #include "clang/Sema/SemaObjC.h" @@ -6248,6 +6249,23 @@ static bool EvaluateBinaryTypeTrait(Sema &Self, TypeTrait BTT, const TypeSourceI TSTToBeDeduced->getTemplateName().getAsTemplateDecl(), RhsT, Info) == TemplateDeductionResult::Success; } + case BTT_IsScalarizedLayoutCompatible: { + if (!LhsT->isVoidType() && !LhsT->isIncompleteArrayType() && + Self.RequireCompleteType(Lhs->getTypeLoc().getBeginLoc(), LhsT, + diag::err_incomplete_type)) + return true; + if (!RhsT->isVoidType() && !RhsT->isIncompleteArrayType() && + Self.RequireCompleteType(Rhs->getTypeLoc().getBeginLoc(), RhsT, + diag::err_incomplete_type)) + return true; + + DiagnoseVLAInCXXTypeTrait( + Self, Lhs, tok::kw___builtin_hlsl_is_scalarized_layout_compatible); + DiagnoseVLAInCXXTypeTrait( + Self, Rhs, tok::kw___builtin_hlsl_is_scalarized_layout_compatible); + + return Self.HLSL().IsScalarizedLayoutCompatible(LhsT, RhsT); + } default: llvm_unreachable("not a BTT"); } diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 17cb47f80590d..714e8f5cfa992 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -1524,3 +1524,85 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { } return false; } + +static void BuildFlattenedTypeList(QualType BaseTy, + llvm::SmallVectorImpl &List) { + llvm::SmallVector WorkList; + WorkList.push_back(BaseTy); + while (!WorkList.empty()) { + QualType T = WorkList.pop_back_val(); + T = T.getCanonicalType().getUnqualifiedType(); + assert(!isa(T) && "Matrix types not yet supported in HLSL"); + if (const auto *AT = dyn_cast(T)) { + 
llvm::SmallVector ElementFields; + // Generally I've avoided recursion in this algorithm, but arrays of + // structs could be time-consuming to flatten and churn through on the + // work list. Hopefully nesting arrays of structs containing arrays + // of structs too many levels deep is unlikely. + BuildFlattenedTypeList(AT->getElementType(), ElementFields); + // Repeat the element's field list n times. + for (uint64_t Ct = 0; Ct < AT->getZExtSize(); ++Ct) + List.insert(List.end(), ElementFields.begin(), ElementFields.end()); + continue; + } + // Vectors can only have element types that are builtin types, so this can + // add directly to the list instead of to the WorkList. + if (const auto *VT = dyn_cast(T)) { + List.insert(List.end(), VT->getNumElements(), VT->getElementType()); + continue; + } + if (const auto *RT = dyn_cast(T)) { + const RecordDecl *RD = RT->getDecl(); + if (RD->isUnion()) { + List.push_back(T); + continue; + } + const CXXRecordDecl *CXXD = dyn_cast(RD); + + llvm::SmallVector FieldTypes; + if (CXXD && CXXD->isStandardLayout()) + RD = CXXD->getStandardLayoutBaseWithFields(); + + for (const auto *FD : RD->fields()) + FieldTypes.push_back(FD->getType()); + // Reverse the newly added sub-range. + std::reverse(FieldTypes.begin(), FieldTypes.end()); + WorkList.insert(WorkList.end(), FieldTypes.begin(), FieldTypes.end()); + + // If this wasn't a standard layout type we may also have some base + // classes to deal with. + if (CXXD && !CXXD->isStandardLayout()) { + FieldTypes.clear(); + for (const auto &Base : CXXD->bases()) + FieldTypes.push_back(Base.getType()); + std::reverse(FieldTypes.begin(), FieldTypes.end()); + WorkList.insert(WorkList.end(), FieldTypes.begin(), FieldTypes.end()); + } + continue; + } + List.push_back(T); + } +} + +bool SemaHLSL::IsScalarizedLayoutCompatible(QualType T1, QualType T2) const { + if (T1.isNull() || T2.isNull()) + return false; + + T1 = T1.getCanonicalType().getUnqualifiedType(); + T2 = T2.getCanonicalType().getUnqualifiedType(); + + // If both types are the same canonical type, they're obviously compatible. + if (SemaRef.getASTContext().hasSameType(T1, T2)) + return true; + + llvm::SmallVector T1Types; + BuildFlattenedTypeList(T1, T1Types); + llvm::SmallVector T2Types; + BuildFlattenedTypeList(T2, T2Types); + + // Check the flattened type list + return llvm::equal(T1Types, T2Types, + [this](QualType LHS, QualType RHS) -> bool { + return SemaRef.IsLayoutCompatible(LHS, RHS); + }); +} diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index fd90f83f3976c..786daea5bab31 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -20,6 +20,7 @@ #include "clang/AST/Expr.h" #include "clang/AST/ExprConcepts.h" #include "clang/AST/PrettyDeclStackTrace.h" +#include "clang/AST/RecursiveASTVisitor.h" #include "clang/AST/Type.h" #include "clang/AST/TypeLoc.h" #include "clang/AST/TypeVisitor.h" @@ -88,12 +89,19 @@ struct Response { // than lambda classes. 
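Because the comparison is over the flattened scalar lists, two aggregates are considered compatible whenever their leaf element types agree in order, regardless of nesting. A hedged source-level illustration (type names invented; in an HLSL translation unit the answer would be queried through the __builtin_hlsl_is_scalarized_layout_compatible trait wired up above):

// Both of these flatten to {float, float, float}:
struct Pair  { float A; float B; };
struct Outer { Pair P; float C; };
struct Flat  { float X; float Y; float Z; };
// __builtin_hlsl_is_scalarized_layout_compatible(Outer, Flat) -> true
// A type flattening to {float, int, float} would compare false, and unions
// are kept whole rather than flattened.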
const FunctionDecl * getPrimaryTemplateOfGenericLambda(const FunctionDecl *LambdaCallOperator) { + if (!isLambdaCallOperator(LambdaCallOperator)) + return LambdaCallOperator; while (true) { if (auto *FTD = dyn_cast_if_present( LambdaCallOperator->getDescribedTemplate()); FTD && FTD->getInstantiatedFromMemberTemplate()) { LambdaCallOperator = FTD->getInstantiatedFromMemberTemplate()->getTemplatedDecl(); + } else if (LambdaCallOperator->getPrimaryTemplate()) { + // Cases where the lambda operator is instantiated in + // TemplateDeclInstantiator::VisitCXXMethodDecl. + LambdaCallOperator = + LambdaCallOperator->getPrimaryTemplate()->getTemplatedDecl(); } else if (auto *Prev = cast(LambdaCallOperator) ->getInstantiatedFromMemberFunction()) LambdaCallOperator = Prev; @@ -139,22 +147,28 @@ getEnclosingTypeAliasTemplateDecl(Sema &SemaRef) { // Check if we are currently inside of a lambda expression that is // surrounded by a using alias declaration. e.g. // template using type = decltype([](auto) { ^ }()); -// By checking if: -// 1. The lambda expression and the using alias declaration share the -// same declaration context. -// 2. They have the same template depth. // We have to do so since a TypeAliasTemplateDecl (or a TypeAliasDecl) is never // a DeclContext, nor does it have an associated specialization Decl from which // we could collect these template arguments. bool isLambdaEnclosedByTypeAliasDecl( - const FunctionDecl *PrimaryLambdaCallOperator, + const FunctionDecl *LambdaCallOperator, const TypeAliasTemplateDecl *PrimaryTypeAliasDecl) { - return cast(PrimaryLambdaCallOperator->getDeclContext()) - ->getTemplateDepth() == - PrimaryTypeAliasDecl->getTemplateDepth() && - getLambdaAwareParentOfDeclContext( - const_cast(PrimaryLambdaCallOperator)) == - PrimaryTypeAliasDecl->getDeclContext(); + struct Visitor : RecursiveASTVisitor { + Visitor(const FunctionDecl *CallOperator) : CallOperator(CallOperator) {} + bool VisitLambdaExpr(const LambdaExpr *LE) { + // Return true to bail out of the traversal, implying the Decl contains + // the lambda. + return getPrimaryTemplateOfGenericLambda(LE->getCallOperator()) != + CallOperator; + } + const FunctionDecl *CallOperator; + }; + + QualType Underlying = + PrimaryTypeAliasDecl->getTemplatedDecl()->getUnderlyingType(); + + return !Visitor(getPrimaryTemplateOfGenericLambda(LambdaCallOperator)) + .TraverseType(Underlying); } // Add template arguments from a variable template instantiation. @@ -293,23 +307,8 @@ Response HandleFunction(Sema &SemaRef, const FunctionDecl *Function, // If this function is a generic lambda specialization, we are done. if (!ForConstraintInstantiation && - isGenericLambdaCallOperatorOrStaticInvokerSpecialization(Function)) { - // TypeAliasTemplateDecls should be taken into account, e.g. - // when we're deducing the return type of a lambda. 
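The new check walks the alias template's underlying type with a RecursiveASTVisitor and bails out when it reaches this lambda's primary call operator, instead of comparing template depths and decl contexts. The shape of code it has to recognize, mirroring the comment above (names illustrative):

template <class T> constexpr int Value = sizeof(T);

// A generic lambda whose closure type is spelled inside the alias template's
// underlying type; its call operator must pick up Alias's template arguments.
template <class T>
using Alias = decltype([](auto X) { return Value<decltype(X)>; }(T{}));

using U = Alias<int>; // instantiates the call operator with X = int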
- // - // template int Value = 0; - // template - // using T = decltype([]() { return Value; }()); - // - if (auto TypeAlias = getEnclosingTypeAliasTemplateDecl(SemaRef)) { - if (isLambdaEnclosedByTypeAliasDecl( - /*PrimaryLambdaCallOperator=*/getPrimaryTemplateOfGenericLambda( - Function), - /*PrimaryTypeAliasDecl=*/TypeAlias.PrimaryTypeAliasDecl)) - return Response::UseNextDecl(Function); - } + isGenericLambdaCallOperatorOrStaticInvokerSpecialization(Function)) return Response::Done(); - } } else if (Function->getDescribedFunctionTemplate()) { assert( @@ -421,10 +420,9 @@ Response HandleRecordDecl(Sema &SemaRef, const CXXRecordDecl *Rec, // Retrieve the template arguments for a using alias declaration. // This is necessary for constraint checking, since we always keep // constraints relative to the primary template. - if (auto TypeAlias = getEnclosingTypeAliasTemplateDecl(SemaRef)) { - const FunctionDecl *PrimaryLambdaCallOperator = - getPrimaryTemplateOfGenericLambda(Rec->getLambdaCallOperator()); - if (isLambdaEnclosedByTypeAliasDecl(PrimaryLambdaCallOperator, + if (auto TypeAlias = getEnclosingTypeAliasTemplateDecl(SemaRef); + ForConstraintInstantiation && TypeAlias) { + if (isLambdaEnclosedByTypeAliasDecl(Rec->getLambdaCallOperator(), TypeAlias.PrimaryTypeAliasDecl)) { Result.addOuterTemplateArguments(TypeAlias.Template, TypeAlias.AssociatedTemplateArguments, @@ -1647,12 +1645,17 @@ namespace { CXXRecordDecl::LambdaDependencyKind ComputeLambdaDependency(LambdaScopeInfo *LSI) { - auto &CCS = SemaRef.CodeSynthesisContexts.back(); - if (CCS.Kind == - Sema::CodeSynthesisContext::TypeAliasTemplateInstantiation) { - unsigned TypeAliasDeclDepth = CCS.Entity->getTemplateDepth(); + if (auto TypeAlias = + TemplateInstArgsHelpers::getEnclosingTypeAliasTemplateDecl( + getSema()); + TypeAlias && TemplateInstArgsHelpers::isLambdaEnclosedByTypeAliasDecl( + LSI->CallOperator, TypeAlias.PrimaryTypeAliasDecl)) { + unsigned TypeAliasDeclDepth = TypeAlias.Template->getTemplateDepth(); if (TypeAliasDeclDepth >= TemplateArgs.getNumSubstitutedLevels()) return CXXRecordDecl::LambdaDependencyKind::LDK_AlwaysDependent; + for (const TemplateArgument &TA : TypeAlias.AssociatedTemplateArguments) + if (TA.isDependent()) + return CXXRecordDecl::LambdaDependencyKind::LDK_AlwaysDependent; } return inherited::ComputeLambdaDependency(LSI); } diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 62287c2d26375..b3854cd8f8222 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -6669,9 +6669,15 @@ QualType TreeTransform::TransformPackIndexingType(TypeLocBuilder &TLB, PackIndexingTypeLoc TL) { // Transform the index - ExprResult IndexExpr = getDerived().TransformExpr(TL.getIndexExpr()); - if (IndexExpr.isInvalid()) - return QualType(); + ExprResult IndexExpr; + { + EnterExpressionEvaluationContext ConstantContext( + SemaRef, Sema::ExpressionEvaluationContext::ConstantEvaluated); + + IndexExpr = getDerived().TransformExpr(TL.getIndexExpr()); + if (IndexExpr.isInvalid()) + return QualType(); + } QualType Pattern = TL.getPattern(); const PackIndexingType *PIT = TL.getTypePtr(); @@ -15299,9 +15305,14 @@ TreeTransform::TransformPackIndexingExpr(PackIndexingExpr *E) { return E; // Transform the index - ExprResult IndexExpr = getDerived().TransformExpr(E->getIndexExpr()); - if (IndexExpr.isInvalid()) - return ExprError(); + ExprResult IndexExpr; + { + EnterExpressionEvaluationContext ConstantContext( + SemaRef, Sema::ExpressionEvaluationContext::ConstantEvaluated); + 
IndexExpr = getDerived().TransformExpr(E->getIndexExpr()); + if (IndexExpr.isInvalid()) + return ExprError(); + } SmallVector ExpandedExprs; if (!E->expandsToEmptyPack() && E->getExpressions().empty()) { diff --git a/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp index 60934e51febe8..04472bb3895a7 100644 --- a/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp @@ -692,6 +692,14 @@ void NullabilityChecker::checkPreStmt(const ReturnStmt *S, NullConstraint Nullness = getNullConstraint(*RetSVal, State); Nullability RequiredNullability = getNullabilityAnnotation(RequiredRetType); + if (const auto *FunDecl = C.getLocationContext()->getDecl(); + FunDecl && FunDecl->getAttr() && + (RequiredNullability == Nullability::Unspecified || + RequiredNullability == Nullability::Nullable)) { + // If a function is marked with the returns_nonnull attribute, + // the return value must be non-null. + RequiredNullability = Nullability::Nonnull; + } // If the returned value is null but the type of the expression // generating it is nonnull then we will suppress the diagnostic. @@ -705,7 +713,7 @@ void NullabilityChecker::checkPreStmt(const ReturnStmt *S, Nullness == NullConstraint::IsNull); if (ChecksEnabled[CK_NullReturnedFromNonnull] && NullReturnedFromNonNull && RetExprTypeLevelNullability != Nullability::Nonnull && - !InSuppressedMethodFamily && C.getLocationContext()->inTopFrame()) { + !InSuppressedMethodFamily) { static CheckerProgramPointTag Tag(this, "NullReturnedFromNonnull"); ExplodedNode *N = C.generateErrorNode(State, &Tag); if (!N) diff --git a/clang/test/AST/ByteCode/new-delete.cpp b/clang/test/AST/ByteCode/new-delete.cpp index a7be4102fd0a0..d733e3182fd59 100644 --- a/clang/test/AST/ByteCode/new-delete.cpp +++ b/clang/test/AST/ByteCode/new-delete.cpp @@ -404,7 +404,7 @@ constexpr typename std::remove_reference::type&& move(T &&t) noexcept { namespace cxx2a { struct A { - int* p = new int(42); // both-note 7{{heap allocation performed here}} + int* p = new int(42); // both-note 3{{heap allocation performed here}} consteval int ret_i() const { return p ? 
*p : 0; } consteval A ret_a() const { return A{}; } constexpr ~A() { delete p; } @@ -433,9 +433,7 @@ void test() { { A k = to_lvalue_ref(A()); } // both-error {{'cxx2a::to_lvalue_ref' is not a constant expression}} \ // both-note {{reference to temporary is not a constant expression}} \ // both-note {{temporary created here}} - { A k = to_lvalue_ref(A().ret_a()); } // both-error {{'cxx2a::A::ret_a' is not a constant expression}} \ - // both-note {{heap-allocated object is not a constant expression}} \ - // both-error {{'cxx2a::to_lvalue_ref' is not a constant expression}} \ + { A k = to_lvalue_ref(A().ret_a()); } // both-error {{'cxx2a::to_lvalue_ref' is not a constant expression}} \ // both-note {{reference to temporary is not a constant expression}} \ // both-note {{temporary created here}} { int k = A().ret_a().ret_i(); } // both-error {{'cxx2a::A::ret_a' is not a constant expression}} \ @@ -445,19 +443,15 @@ void test() { { int k = const_a_ref(a); } { int k = rvalue_ref(A()); } { int k = rvalue_ref(std::move(a)); } - { int k = const_a_ref(A().ret_a()); } // both-error {{'cxx2a::A::ret_a' is not a constant expression}} \ - // both-note {{is not a constant expression}} - { int k = const_a_ref(to_lvalue_ref(A().ret_a())); } // both-error {{'cxx2a::A::ret_a' is not a constant expression}} \ - // both-note {{is not a constant expression}} + { int k = const_a_ref(A().ret_a()); } + { int k = const_a_ref(to_lvalue_ref(A().ret_a())); } { int k = const_a_ref(to_lvalue_ref(std::move(a))); } { int k = by_value_a(A().ret_a()); } { int k = by_value_a(to_lvalue_ref(static_cast(a))); } { int k = (A().ret_a(), A().ret_i()); } // both-error {{'cxx2a::A::ret_a' is not a constant expression}} \ // both-note {{is not a constant expression}} \ // both-warning {{left operand of comma operator has no effect}} - { int k = (const_a_ref(A().ret_a()), A().ret_i()); } // both-error {{'cxx2a::A::ret_a' is not a constant expression}} \ - // both-note {{is not a constant expression}} \ - // both-warning {{left operand of comma operator has no effect}} + { int k = (const_a_ref(A().ret_a()), A().ret_i()); } // both-warning {{left operand of comma operator has no effect}} } } diff --git a/clang/test/Analysis/nullability.c b/clang/test/Analysis/nullability.c index fbc03c864ad83..57ce9ac8aee3d 100644 --- a/clang/test/Analysis/nullability.c +++ b/clang/test/Analysis/nullability.c @@ -1,12 +1,65 @@ -// RUN: %clang_analyze_cc1 -fblocks -analyzer-checker=core,nullability -Wno-deprecated-non-prototype -verify %s +// RUN: %clang_analyze_cc1 -fblocks -analyzer-checker=core,nullability,debug.ExprInspection -verify %s + +void clang_analyzer_warnIfReached(void); void it_takes_two(int a, int b); -void function_pointer_arity_mismatch() { +void function_pointer_arity_mismatch(void) { void(*fptr)() = it_takes_two; fptr(1); // no-crash expected-warning {{Function taking 2 arguments is called with fewer (1)}} + // expected-warning@-1 {{passing arguments to a function without a prototype is deprecated in all versions of C and is not supported in C23}} } -void block_arity_mismatch() { +void block_arity_mismatch(void) { void(^b)() = ^(int a, int b) { }; b(1); // no-crash expected-warning {{Block taking 2 arguments is called with fewer (1)}} + // expected-warning@-1 {{passing arguments to a function without a prototype is deprecated in all versions of C and is not supported in C23}} +} + +int *nonnull_return_annotation_indirect(void) __attribute__((returns_nonnull)); +int *nonnull_return_annotation_indirect(void) { + int *x = 0; + return x; 
// expected-warning {{Null returned from a function that is expected to return a non-null value}} +} + +int *nonnull_return_annotation_direct(void) __attribute__((returns_nonnull)); +int *nonnull_return_annotation_direct(void) { + return 0; // expected-warning {{Null returned from a function that is expected to return a non-null value}} +} // expected-warning@-1 {{null returned from function that requires a non-null return value}} + +int *nonnull_return_annotation_assumed(int* ptr) __attribute__((returns_nonnull)); +int *nonnull_return_annotation_assumed(int* ptr) { + if (ptr) { + return ptr; + } + return ptr; // expected-warning {{Null returned from a function that is expected to return a non-null value}} +} + +int *produce_nonnull_ptr(void) __attribute__((returns_nonnull)); + +__attribute__((returns_nonnull)) +int *cannot_return_null(void) { + int *x = produce_nonnull_ptr(); + if (!x) { + clang_analyzer_warnIfReached(); + // expected-warning@-1 {{REACHABLE}} + // TODO: This warning is a false positive, according to the contract of + // produce_nonnull_ptr, x cannot be null. + } + // Regardless of the potential state split above, x cannot be nullptr + // according to the produce_nonnull_ptr annotation. + return x; + // False positive: expected-warning@-1 {{Null returned from a function that is expected to return a non-null value}} +} + +__attribute__((returns_nonnull)) int *passthrough(int *p) { + return p; // no-warning: we have no evidence that `p` is null, i.e., violating the contract +} + +__attribute__((returns_nonnull)) int *passthrough2(int *p); +int *passthrough2(int *p) { + return p; // expected-warning{{Null returned from a function that is expected to return a non-null value}} +} + +void call_with_null(void) { + passthrough2(0); } diff --git a/clang/test/Analysis/nullability.mm b/clang/test/Analysis/nullability.mm index d69116d03df74..64222d939bdd0 100644 --- a/clang/test/Analysis/nullability.mm +++ b/clang/test/Analysis/nullability.mm @@ -438,7 +438,7 @@ -(Dummy *)callerWithParam:(Dummy * _Nonnull) p1 { int * _Nonnull InlinedReturnNullOverSuppressionCallee(int * _Nonnull p2) { int *result = 0; - return result; // no-warning; but this is an over suppression + return result; // expected-warning{{Null returned from a function that is expected to return a non-null value}} } int *InlinedReturnNullOverSuppressionCaller(int * _Nonnull p1) { diff --git a/clang/test/CodeGen/2005-01-02-ConstantInits.c b/clang/test/CodeGen/2005-01-02-ConstantInits.c index 7772a64331ffb..d90c2ea42da61 100644 --- a/clang/test/CodeGen/2005-01-02-ConstantInits.c +++ b/clang/test/CodeGen/2005-01-02-ConstantInits.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-globals --global-value-regex "@.+" +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-globals --global-value-regex "[A-Za-z].*" // RUN: %clang_cc1 -triple=x86_64-unknown-linux %s -emit-llvm -o - | FileCheck %s // This tests all kinds of hard cases with initializers and @@ -51,7 +51,7 @@ int foo(int i) { return bar(&Arr[49])+bar(&Arr[i]); } // CHECK-NEXT: store i32 [[I]], ptr [[I_ADDR]], align 4 // CHECK-NEXT: store ptr @Arr, ptr [[P]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i32 1 // CHECK-NEXT: store ptr 
[[INCDEC_PTR]], ptr [[P]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[I_ADDR]], align 4 // CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[TMP1]] to i64 diff --git a/clang/test/CodeGen/PowerPC/ppc-emmintrin.c b/clang/test/CodeGen/PowerPC/ppc-emmintrin.c index a3650beec625f..4c4d0dfce05ea 100644 --- a/clang/test/CodeGen/PowerPC/ppc-emmintrin.c +++ b/clang/test/CodeGen/PowerPC/ppc-emmintrin.c @@ -1012,14 +1012,14 @@ test_shuffle() { // CHECK: %[[SHR:[0-9a-zA-Z_.]+]] = ashr i32 %{{[0-9a-zA-Z_.]+}}, 6 // CHECK: %[[AND4:[0-9a-zA-Z_.]+]] = and i32 %[[SHR]], 3 // CHECK: sext i32 %[[AND4]] to i64 -// CHECK: getelementptr inbounds [4 x i32], ptr @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: getelementptr inbounds nuw [4 x i32], ptr @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 0 -// CHECK: getelementptr inbounds [4 x i32], ptr @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: getelementptr inbounds nuw [4 x i32], ptr @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 1 -// CHECK: getelementptr inbounds [4 x i32], ptr @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: getelementptr inbounds nuw [4 x i32], ptr @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} // CHECK: %[[ADD:[0-9a-zA-Z_.]+]] = add i32 %{{[0-9a-zA-Z_.]+}}, 269488144 // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %[[ADD]], i32 2 -// CHECK: getelementptr inbounds [4 x i32], ptr @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: getelementptr inbounds nuw [4 x i32], ptr @_mm_shuffle_epi32.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} // CHECK: add i32 %{{[0-9a-zA-Z_.]+}}, 269488144 // CHECK: call <4 x i32> @vec_perm(int vector[4], int vector[4], unsigned char vector[16]) @@ -1050,7 +1050,7 @@ test_shuffle() { // CHECK: sext i32 %[[AND4]] to i64 // CHECK-LE: store <2 x i64> , ptr %{{[0-9a-zA-Z_.]+}}, align 16 // CHECK-BE: store <2 x i64> , ptr %{{[0-9a-zA-Z_.]+}}, align 16 -// CHECK-COUNT-4: getelementptr inbounds [4 x i16], ptr @_mm_shufflehi_epi16.__permute_selectors, i64 0, i64 {{[0-9a-zA-Z_%.]+}} +// CHECK-COUNT-4: getelementptr inbounds nuw [4 x i16], ptr @_mm_shufflehi_epi16.__permute_selectors, i64 0, i64 {{[0-9a-zA-Z_%.]+}} // CHECK: call <2 x i64> @vec_perm(unsigned long long vector[2], unsigned long long vector[2], unsigned char vector[16]) // CHECK-LABEL: define available_externally <2 x i64> @_mm_shufflelo_epi16 @@ -1067,7 +1067,7 @@ test_shuffle() { // CHECK: sext i32 %[[AND4]] to i64 // CHECK-LE: store <2 x i64> , ptr %{{[0-9a-zA-Z_.]+}}, align 16 // CHECK-BE: store <2 x i64> , ptr %{{[0-9a-zA-Z_.]+}}, align 16 -// CHECK-COUNT-4: getelementptr inbounds [4 x i16], ptr @_mm_shufflelo_epi16.__permute_selectors, i64 0, i64 {{[0-9a-zA-Z_%.]+}} +// CHECK-COUNT-4: getelementptr inbounds nuw [4 x i16], ptr @_mm_shufflelo_epi16.__permute_selectors, i64 0, i64 {{[0-9a-zA-Z_%.]+}} // CHECK: call <2 x i64> @vec_perm(unsigned long long vector[2], unsigned long long vector[2], unsigned char vector[16]) void __attribute__((noinline)) diff --git a/clang/test/CodeGen/PowerPC/ppc-xmmintrin.c b/clang/test/CodeGen/PowerPC/ppc-xmmintrin.c index 95dfd1202f157..4a15fa9f76cee 100644 --- a/clang/test/CodeGen/PowerPC/ppc-xmmintrin.c +++ 
b/clang/test/CodeGen/PowerPC/ppc-xmmintrin.c @@ -894,16 +894,16 @@ test_shuffle() { // CHECK: %[[SHR3:[0-9a-zA-Z_.]+]] = ashr i32 %{{[0-9a-zA-Z_.]+}}, 6 // CHECK: %[[AND4:[0-9a-zA-Z_.]+]] = and i32 %[[SHR3]], 3 // CHECK: sext i32 %[[AND4]] to i64 -// CHECK: getelementptr inbounds [4 x i16], ptr @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: getelementptr inbounds nuw [4 x i16], ptr @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} // CHECK-LE: getelementptr inbounds [4 x i16], ptr %{{[0-9a-zA-Z_.]+}}, i64 0, i64 0 // CHECK-BE: getelementptr inbounds [4 x i16], ptr %{{[0-9a-zA-Z_.]+}}, i64 0, i64 3 -// CHECK: getelementptr inbounds [4 x i16], ptr @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: getelementptr inbounds nuw [4 x i16], ptr @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} // CHECK-LE: getelementptr inbounds [4 x i16], ptr %{{[0-9a-zA-Z_.]+}}, i64 0, i64 1 // CHECK-BE: getelementptr inbounds [4 x i16], ptr %{{[0-9a-zA-Z_.]+}}, i64 0, i64 2 -// CHECK: getelementptr inbounds [4 x i16], ptr @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: getelementptr inbounds nuw [4 x i16], ptr @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} // CHECK-LE: getelementptr inbounds [4 x i16], ptr %{{[0-9a-zA-Z_.]+}}, i64 0, i64 2 // CHECK-BE: getelementptr inbounds [4 x i16], ptr %{{[0-9a-zA-Z_.]+}}, i64 0, i64 1 -// CHECK: getelementptr inbounds [4 x i16], ptr @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: getelementptr inbounds nuw [4 x i16], ptr @_mm_shuffle_pi16.__permute_selectors, i64 0, i64 %{{[0-9a-zA-Z_.]+}} // CHECK-LE: getelementptr inbounds [4 x i16], ptr %{{[0-9a-zA-Z_.]+}}, i64 0, i64 3 // CHECK-BE: getelementptr inbounds [4 x i16], ptr %{{[0-9a-zA-Z_.]+}}, i64 0, i64 0 // CHECK: call <2 x i64> @vec_splats(unsigned long long) @@ -923,14 +923,14 @@ test_shuffle() { // CHECK: %[[SHR3:[0-9a-zA-Z_.]+]] = ashr i32 %{{[0-9a-zA-Z_.]+}}, 6 // CHECK: %[[AND4:[0-9a-zA-Z_.]+]] = and i32 %[[SHR3]], 3 // CHECK: sext i32 %[[AND4]] to i64 -// CHECK: getelementptr inbounds [4 x i32], ptr @_mm_shuffle_ps.__permute_selectors, i64 0, i64 +// CHECK: getelementptr inbounds nuw [4 x i32], ptr @_mm_shuffle_ps.__permute_selectors, i64 0, i64 // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 0 -// CHECK: getelementptr inbounds [4 x i32], ptr @_mm_shuffle_ps.__permute_selectors, i64 0, i64 +// CHECK: getelementptr inbounds nuw [4 x i32], ptr @_mm_shuffle_ps.__permute_selectors, i64 0, i64 // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 1 -// CHECK: getelementptr inbounds [4 x i32], ptr @_mm_shuffle_ps.__permute_selectors, i64 0, i64 +// CHECK: getelementptr inbounds nuw [4 x i32], ptr @_mm_shuffle_ps.__permute_selectors, i64 0, i64 // CHECK: %[[ADD:[0-9a-zA-Z_.]+]] = add i32 %{{[0-9a-zA-Z_.]+}}, 269488144 // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %[[ADD]], i32 2 -// CHECK: getelementptr inbounds [4 x i32], ptr @_mm_shuffle_ps.__permute_selectors, i64 0, i64 +// CHECK: getelementptr inbounds nuw [4 x i32], ptr @_mm_shuffle_ps.__permute_selectors, i64 0, i64 // CHECK: %[[ADD2:[0-9a-zA-Z_.]+]] = add i32 %{{[0-9a-zA-Z_.]+}}, 269488144 // CHECK: insertelement <4 x i32> %{{[0-9a-zA-Z_.]+}}, i32 %[[ADD2]], i32 3 // CHECK: call <4 x float> @vec_perm(float vector[4], float vector[4], unsigned char vector[16]) diff --git 
a/clang/test/CodeGen/attr-counted-by.c b/clang/test/CodeGen/attr-counted-by.c index 3ed8b6f0c7186..f3b1653d3632a 100644 --- a/clang/test/CodeGen/attr-counted-by.c +++ b/clang/test/CodeGen/attr-counted-by.c @@ -118,7 +118,7 @@ void test1(struct annotated *p, int index, int val) { // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 -// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = shl i32 [[DOT_COUNTED_BY_LOAD]], 2 // SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0 // SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP2]] @@ -134,7 +134,7 @@ void test1(struct annotated *p, int index, int val) { // NO-SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0 // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP0]] // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // @@ -142,7 +142,7 @@ void test1(struct annotated *p, int index, int val) { // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITHOUT-ATTR-NEXT: entry: // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 -// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // @@ -150,7 +150,7 @@ void test1(struct annotated *p, int index, int val) { // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // @@ -207,7 +207,7 @@ size_t test2_bdos(struct annotated *p) { // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 -// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr 
[[ARRAY]], i64 0, i64 [[INDEX]] // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = shl nsw i64 [[TMP2]], 2 // SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP3]], i64 4) @@ -231,7 +231,7 @@ size_t test2_bdos(struct annotated *p) { // NO-SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0 // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP4]] // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // @@ -239,7 +239,7 @@ size_t test2_bdos(struct annotated *p) { // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITHOUT-ATTR-NEXT: entry: // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 -// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // @@ -247,7 +247,7 @@ size_t test2_bdos(struct annotated *p) { // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // diff --git a/clang/test/CodeGen/catch-nullptr-and-nonzero-offset-when-nullptr-is-defined.c b/clang/test/CodeGen/catch-nullptr-and-nonzero-offset-when-nullptr-is-defined.c index 39ede01d6e3b8..8a560a47ad1e1 100644 --- a/clang/test/CodeGen/catch-nullptr-and-nonzero-offset-when-nullptr-is-defined.c +++ b/clang/test/CodeGen/catch-nullptr-and-nonzero-offset-when-nullptr-is-defined.c @@ -33,7 +33,7 @@ char *add_unsigned(char *base, unsigned long offset) { // CHECK-NEXT: store i64 %[[OFFSET]], ptr %[[OFFSET_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[OFFSET_RELOADED:.*]] = load i64, ptr %[[OFFSET_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_AGGREGATE:.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %[[OFFSET_RELOADED]]), !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_OVERFLOWED:.*]] = extractvalue { i64, i1 } 
%[[COMPUTED_OFFSET_AGGREGATE]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[OR_OV:.+]] = or i1 %[[COMPUTED_OFFSET_OVERFLOWED]], false, !nosanitize diff --git a/clang/test/CodeGen/catch-nullptr-and-nonzero-offset.c b/clang/test/CodeGen/catch-nullptr-and-nonzero-offset.c index e93dbcb9f647b..d884993ffb2b3 100644 --- a/clang/test/CodeGen/catch-nullptr-and-nonzero-offset.c +++ b/clang/test/CodeGen/catch-nullptr-and-nonzero-offset.c @@ -50,7 +50,7 @@ char *var_var(char *base, unsigned long offset) { // CHECK-NEXT: store i64 %[[OFFSET]], ptr %[[OFFSET_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[OFFSET_RELOADED:.*]] = load i64, ptr %[[OFFSET_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_AGGREGATE:.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %[[OFFSET_RELOADED]]), !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_OVERFLOWED:.*]] = extractvalue { i64, i1 } %[[COMPUTED_OFFSET_AGGREGATE]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[OR_OV:.+]] = or i1 %[[COMPUTED_OFFSET_OVERFLOWED]], false, !nosanitize @@ -83,7 +83,7 @@ char *var_zero(char *base) { // CHECK-NEXT: %[[BASE_ADDR:.*]] = alloca ptr, align 8 // CHECK-NEXT: store ptr %[[BASE]], ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i64 0 + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i64 0 // CHECK-SANITIZE-C-NEXT: %[[BASE_RELOADED_INT:.*]] = ptrtoint ptr %[[BASE_RELOADED]] to i64, !nosanitize // CHECK-SANITIZE-C-NEXT: %[[COMPUTED_GEP:.*]] = add i64 %[[BASE_RELOADED_INT]], 0, !nosanitize // CHECK-SANITIZE-C-NEXT: %[[BASE_IS_NOT_NULLPTR:.*]] = icmp ne ptr %[[BASE_RELOADED]], null, !nosanitize @@ -111,7 +111,7 @@ char *var_one(char *base) { // CHECK-NEXT: %[[BASE_ADDR:.*]] = alloca ptr, align 8 // CHECK-NEXT: store ptr %[[BASE]], ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i64 1 + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i64 1 // CHECK-SANITIZE-NEXT: %[[BASE_RELOADED_INT:.*]] = ptrtoint ptr %[[BASE_RELOADED]] to i64, !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_GEP:.*]] = add i64 %[[BASE_RELOADED_INT]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[BASE_IS_NOT_NULLPTR:.*]] = icmp ne ptr %[[BASE_RELOADED]], null, !nosanitize @@ -140,7 +140,7 @@ char *var_allones(char *base) { // CHECK-NEXT: %[[BASE_ADDR:.*]] = alloca ptr, align 8 // CHECK-NEXT: store ptr %[[BASE]], ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i64 -1 + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i64 -1 // CHECK-SANITIZE-NEXT: %[[BASE_RELOADED_INT:.*]] = ptrtoint ptr %[[BASE_RELOADED]] to i64, !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_GEP:.*]] = add i64 %[[BASE_RELOADED_INT]], -1, !nosanitize // CHECK-SANITIZE-NEXT: %[[BASE_IS_NOT_NULLPTR:.*]] = icmp ne ptr %[[BASE_RELOADED]], null, !nosanitize @@ 
-171,7 +171,7 @@ char *nullptr_var(unsigned long offset) { // CHECK-NEXT: %[[OFFSET_ADDR:.*]] = alloca i64, align 8 // CHECK-NEXT: store i64 %[[OFFSET]], ptr %[[OFFSET_ADDR]], align 8 // CHECK-NEXT: %[[OFFSET_RELOADED:.*]] = load i64, ptr %[[OFFSET_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr null, i64 %[[OFFSET_RELOADED]] + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr null, i64 %[[OFFSET_RELOADED]] // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_AGGREGATE:.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %[[OFFSET_RELOADED]]), !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_OVERFLOWED:.*]] = extractvalue { i64, i1 } %[[COMPUTED_OFFSET_AGGREGATE]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[OR_OV:.+]] = or i1 %[[COMPUTED_OFFSET_OVERFLOWED]], false, !nosanitize @@ -217,17 +217,17 @@ char *nullptr_zero(void) { char *nullptr_one_BAD(void) { // CHECK: define{{.*}} ptr @nullptr_one_BAD() // CHECK-NEXT: [[ENTRY:.*]]: - // CHECK-SANITIZE-NEXT: %[[CMP:.*]] = icmp ne i64 ptrtoint (ptr getelementptr inbounds (i8, ptr null, i64 1) to i64), 0, !nosanitize + // CHECK-SANITIZE-NEXT: %[[CMP:.*]] = icmp ne i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr null, i64 1) to i64), 0, !nosanitize // CHECK-SANITIZE-C-NEXT: %[[COND:.*]] = and i1 false, %[[CMP]], !nosanitize // CHECK-SANITIZE-CPP-NEXT: %[[COND:.*]] = icmp eq i1 false, %[[CMP]], !nosanitize // CHECK-SANITIZE-NEXT: br i1 %[[COND]], label %[[CONT:.*]], label %[[HANDLER_POINTER_OVERFLOW:[^,]+]],{{.*}} !nosanitize // CHECK-SANITIZE: [[HANDLER_POINTER_OVERFLOW]]: - // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_700]], i64 0, i64 ptrtoint (ptr getelementptr inbounds (i8, ptr null, i64 1) to i64)) - // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_700]], i64 0, i64 ptrtoint (ptr getelementptr inbounds (i8, ptr null, i64 1) to i64)) + // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_700]], i64 0, i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr null, i64 1) to i64)) + // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_700]], i64 0, i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr null, i64 1) to i64)) // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.ubsantrap(i8 19){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-NEXT: ret ptr getelementptr inbounds (i8, ptr null, i64 1) + // CHECK-NEXT: ret ptr getelementptr inbounds nuw (i8, ptr null, i64 1) static char *const base = (char *)0; static const unsigned long offset = 1; #line 700 @@ -237,17 +237,17 @@ char *nullptr_one_BAD(void) { char *nullptr_allones_BAD(void) { // CHECK: define{{.*}} ptr @nullptr_allones_BAD() // CHECK-NEXT: [[ENTRY:.*]]: - // CHECK-SANITIZE-NEXT: %[[CMP:.*]] = icmp ne i64 ptrtoint (ptr getelementptr inbounds (i8, ptr null, i64 -1) to i64), 0, !nosanitize + // CHECK-SANITIZE-NEXT: %[[CMP:.*]] = icmp ne i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr null, i64 -1) to i64), 0, !nosanitize // CHECK-SANITIZE-C-NEXT: %[[COND:.*]] = and i1 false, %[[CMP]], !nosanitize // CHECK-SANITIZE-CPP-NEXT: %[[COND:.*]] = icmp eq i1 false, %[[CMP]], !nosanitize // CHECK-SANITIZE-NEXT: br i1 %[[COND]], label %[[CONT:.*]], label %[[HANDLER_POINTER_OVERFLOW:[^,]+]],{{.*}} !nosanitize // CHECK-SANITIZE: [[HANDLER_POINTER_OVERFLOW]]: - // CHECK-SANITIZE-NORECOVER-NEXT: call void 
@__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_800]], i64 0, i64 ptrtoint (ptr getelementptr inbounds (i8, ptr null, i64 -1) to i64)) - // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_800]], i64 0, i64 ptrtoint (ptr getelementptr inbounds (i8, ptr null, i64 -1) to i64)) + // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_800]], i64 0, i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr null, i64 -1) to i64)) + // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_800]], i64 0, i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr null, i64 -1) to i64)) // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.ubsantrap(i8 19){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-NEXT: ret ptr getelementptr inbounds (i8, ptr null, i64 -1) + // CHECK-NEXT: ret ptr getelementptr inbounds nuw (i8, ptr null, i64 -1) static char *const base = (char *)0; static const unsigned long offset = -1; #line 800 @@ -262,7 +262,7 @@ char *one_var(unsigned long offset) { // CHECK-NEXT: %[[OFFSET_ADDR:.*]] = alloca i64, align 8 // CHECK-NEXT: store i64 %[[OFFSET]], ptr %[[OFFSET_ADDR]], align 8 // CHECK-NEXT: %[[OFFSET_RELOADED:.*]] = load i64, ptr %[[OFFSET_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr inttoptr (i64 1 to ptr), i64 %[[OFFSET_RELOADED]] + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr inttoptr (i64 1 to ptr), i64 %[[OFFSET_RELOADED]] // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_AGGREGATE:.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %[[OFFSET_RELOADED]]), !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_OVERFLOWED:.*]] = extractvalue { i64, i1 } %[[COMPUTED_OFFSET_AGGREGATE]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[OR_OV:.+]] = or i1 %[[COMPUTED_OFFSET_OVERFLOWED]], false, !nosanitize @@ -312,17 +312,17 @@ char *one_one_OK(void) { // CHECK: define{{.*}} ptr @one_one_OK() // CHECK-NEXT: [[ENTRY:.*]]: // CHECK-SANITIZE-NEXT: %[[CMP1:.*]] = icmp ne ptr inttoptr (i64 1 to ptr), null, !nosanitize - // CHECK-SANITIZE-NEXT: %[[CMP2:.*]] = icmp ne i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 1 to ptr), i64 1) to i64), i64 1), i64 1), 0, !nosanitize + // CHECK-SANITIZE-NEXT: %[[CMP2:.*]] = icmp ne i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 1 to ptr), i64 1) to i64), i64 1), i64 1), 0, !nosanitize // CHECK-SANITIZE-C-NEXT: %[[COND:.*]] = and i1 %[[CMP1]], %[[CMP2]], !nosanitize // CHECK-SANITIZE-CPP-NEXT: %[[COND:.*]] = icmp eq i1 %[[CMP1]], %[[CMP2]], !nosanitize // CHECK-SANITIZE-NEXT: br i1 %[[COND]], label %[[CONT:.*]], label %[[HANDLER_POINTER_OVERFLOW:[^,]+]],{{.*}} !nosanitize // CHECK-SANITIZE: [[HANDLER_POINTER_OVERFLOW]]: - // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_1100]], i64 1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 1 to ptr), i64 1) to i64), i64 1), i64 1)) - // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_1100]], i64 1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 1 to ptr), i64 1) to i64), i64 1), i64 1)) + // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_1100]], i64 1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr 
inttoptr (i64 1 to ptr), i64 1) to i64), i64 1), i64 1)) + // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_1100]], i64 1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 1 to ptr), i64 1) to i64), i64 1), i64 1)) // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.ubsantrap(i8 19){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-NEXT: ret ptr getelementptr inbounds (i8, ptr inttoptr (i64 1 to ptr), i64 1) + // CHECK-NEXT: ret ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 1 to ptr), i64 1) static char *const base = (char *)1; static const unsigned long offset = 1; #line 1100 @@ -333,17 +333,17 @@ char *one_allones_BAD(void) { // CHECK: define{{.*}} ptr @one_allones_BAD() // CHECK-NEXT: [[ENTRY:.*]]: // CHECK-SANITIZE-NEXT: %[[CMP1:.*]] = icmp ne ptr inttoptr (i64 1 to ptr), null, !nosanitize - // CHECK-SANITIZE-NEXT: %[[CMP2:.*]] = icmp ne i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 1 to ptr), i64 -1) to i64), i64 1), i64 1), 0, !nosanitize + // CHECK-SANITIZE-NEXT: %[[CMP2:.*]] = icmp ne i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 1 to ptr), i64 -1) to i64), i64 1), i64 1), 0, !nosanitize // CHECK-SANITIZE-C-NEXT: %[[COND:.*]] = and i1 %[[CMP1]], %[[CMP2]], !nosanitize // CHECK-SANITIZE-CPP-NEXT: %[[COND:.*]] = icmp eq i1 %[[CMP1]], %[[CMP2]], !nosanitize // CHECK-SANITIZE-NEXT: br i1 %[[COND]], label %[[CONT:.*]], label %[[HANDLER_POINTER_OVERFLOW:[^,]+]],{{.*}} !nosanitize // CHECK-SANITIZE: [[HANDLER_POINTER_OVERFLOW]]: - // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_1200]], i64 1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 1 to ptr), i64 -1) to i64), i64 1), i64 1)) - // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_1200]], i64 1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 1 to ptr), i64 -1) to i64), i64 1), i64 1)) + // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_1200]], i64 1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 1 to ptr), i64 -1) to i64), i64 1), i64 1)) + // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_1200]], i64 1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 1 to ptr), i64 -1) to i64), i64 1), i64 1)) // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.ubsantrap(i8 19){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-NEXT: ret ptr getelementptr inbounds (i8, ptr inttoptr (i64 1 to ptr), i64 -1) + // CHECK-NEXT: ret ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 1 to ptr), i64 -1) static char *const base = (char *)1; static const unsigned long offset = -1; #line 1200 @@ -358,7 +358,7 @@ char *allones_var(unsigned long offset) { // CHECK-NEXT: %[[OFFSET_ADDR:.*]] = alloca i64, align 8 // CHECK-NEXT: store i64 %[[OFFSET]], ptr %[[OFFSET_ADDR]], align 8 // CHECK-NEXT: %[[OFFSET_RELOADED:.*]] = load i64, ptr %[[OFFSET_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr inttoptr (i64 -1 to ptr), i64 %[[OFFSET_RELOADED]] + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr inttoptr (i64 -1 to ptr), i64 
%[[OFFSET_RELOADED]] // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_AGGREGATE:.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %[[OFFSET_RELOADED]]), !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_OVERFLOWED:.*]] = extractvalue { i64, i1 } %[[COMPUTED_OFFSET_AGGREGATE]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[OR_OV:.+]] = or i1 %[[COMPUTED_OFFSET_OVERFLOWED]], false, !nosanitize @@ -408,17 +408,17 @@ char *allones_one_BAD(void) { // CHECK: define{{.*}} ptr @allones_one_BAD() // CHECK-NEXT: [[ENTRY:.*]]: // CHECK-SANITIZE-NEXT: %[[CMP1:.*]] = icmp ne ptr inttoptr (i64 -1 to ptr), null, !nosanitize - // CHECK-SANITIZE-NEXT: %[[CMP2:.*]] = icmp ne i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 -1 to ptr), i64 1) to i64), i64 -1), i64 -1), 0, !nosanitize + // CHECK-SANITIZE-NEXT: %[[CMP2:.*]] = icmp ne i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 -1 to ptr), i64 1) to i64), i64 -1), i64 -1), 0, !nosanitize // CHECK-SANITIZE-C-NEXT: %[[COND:.*]] = and i1 %[[CMP1]], %[[CMP2]], !nosanitize // CHECK-SANITIZE-CPP-NEXT: %[[COND:.*]] = icmp eq i1 %[[CMP1]], %[[CMP2]], !nosanitize // CHECK-SANITIZE-NEXT: br i1 %[[COND]], label %[[CONT:.*]], label %[[HANDLER_POINTER_OVERFLOW:[^,]+]],{{.*}} !nosanitize // CHECK-SANITIZE: [[HANDLER_POINTER_OVERFLOW]]: - // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_1500]], i64 -1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 -1 to ptr), i64 1) to i64), i64 -1), i64 -1)) - // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_1500]], i64 -1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 -1 to ptr), i64 1) to i64), i64 -1), i64 -1)) + // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_1500]], i64 -1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 -1 to ptr), i64 1) to i64), i64 -1), i64 -1)) + // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_1500]], i64 -1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 -1 to ptr), i64 1) to i64), i64 -1), i64 -1)) // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.ubsantrap(i8 19){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-NEXT: ret ptr getelementptr inbounds (i8, ptr inttoptr (i64 -1 to ptr), i64 1) + // CHECK-NEXT: ret ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 -1 to ptr), i64 1) static char *const base = (char *)-1; static const unsigned long offset = 1; #line 1500 @@ -429,17 +429,17 @@ char *allones_allones_OK(void) { // CHECK: define{{.*}} ptr @allones_allones_OK() // CHECK-NEXT: [[ENTRY:.*]]: // CHECK-SANITIZE-NEXT: %[[CMP1:.*]] = icmp ne ptr inttoptr (i64 -1 to ptr), null, !nosanitize - // CHECK-SANITIZE-NEXT: %[[CMP2:.*]] = icmp ne i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 -1 to ptr), i64 -1) to i64), i64 -1), i64 -1), 0, !nosanitize + // CHECK-SANITIZE-NEXT: %[[CMP2:.*]] = icmp ne i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 -1 to ptr), i64 -1) to i64), i64 -1), i64 -1), 0, !nosanitize // CHECK-SANITIZE-C-NEXT: %[[COND:.*]] = and i1 %[[CMP1]], %[[CMP2]], !nosanitize // CHECK-SANITIZE-CPP-NEXT: %[[COND:.*]] = icmp eq i1 %[[CMP1]], %[[CMP2]], !nosanitize // 
CHECK-SANITIZE-NEXT: br i1 %[[COND]], label %[[CONT:.*]], label %[[HANDLER_POINTER_OVERFLOW:[^,]+]],{{.*}} !nosanitize // CHECK-SANITIZE: [[HANDLER_POINTER_OVERFLOW]]: - // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_1600]], i64 -1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 -1 to ptr), i64 -1) to i64), i64 -1), i64 -1)) - // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_1600]], i64 -1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr inttoptr (i64 -1 to ptr), i64 -1) to i64), i64 -1), i64 -1)) + // CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_pointer_overflow_abort(ptr @[[LINE_1600]], i64 -1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 -1 to ptr), i64 -1) to i64), i64 -1), i64 -1)) + // CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_pointer_overflow(ptr @[[LINE_1600]], i64 -1, i64 add (i64 sub (i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 -1 to ptr), i64 -1) to i64), i64 -1), i64 -1)) // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.ubsantrap(i8 19){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-NEXT: ret ptr getelementptr inbounds (i8, ptr inttoptr (i64 -1 to ptr), i64 -1) + // CHECK-NEXT: ret ptr getelementptr inbounds nuw (i8, ptr inttoptr (i64 -1 to ptr), i64 -1) static char *const base = (char *)-1; static const unsigned long offset = -1; #line 1600 @@ -461,7 +461,7 @@ char *void_ptr(void *base, unsigned long offset) { // CHECK-NEXT: store i64 %[[OFFSET]], ptr %[[OFFSET_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[OFFSET_RELOADED:.*]] = load i64, ptr %[[OFFSET_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_AGGREGATE:.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %[[OFFSET_RELOADED]]), !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_OVERFLOWED:.*]] = extractvalue { i64, i1 } %[[COMPUTED_OFFSET_AGGREGATE]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[OR_OV:.+]] = or i1 %[[COMPUTED_OFFSET_OVERFLOWED]], false, !nosanitize diff --git a/clang/test/CodeGen/catch-pointer-overflow-volatile.c b/clang/test/CodeGen/catch-pointer-overflow-volatile.c index 4b0653a0ae59e..626bbc0db7afb 100644 --- a/clang/test/CodeGen/catch-pointer-overflow-volatile.c +++ b/clang/test/CodeGen/catch-pointer-overflow-volatile.c @@ -23,7 +23,7 @@ char *volatile_ptr(char *volatile base, unsigned long offset) { // CHECK-NEXT: store i64 %[[OFFSET]], ptr %[[OFFSET_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load volatile ptr, ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[OFFSET_RELOADED:.*]] = load i64, ptr %[[OFFSET_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_AGGREGATE:.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %[[OFFSET_RELOADED]]), !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_OVERFLOWED:.*]] = extractvalue { i64, i1 } 
%[[COMPUTED_OFFSET_AGGREGATE]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[OR_OV:.+]] = or i1 %[[COMPUTED_OFFSET_OVERFLOWED]], false, !nosanitize diff --git a/clang/test/CodeGen/catch-pointer-overflow.c b/clang/test/CodeGen/catch-pointer-overflow.c index 899af73bd81e0..1f7f1729098c7 100644 --- a/clang/test/CodeGen/catch-pointer-overflow.c +++ b/clang/test/CodeGen/catch-pointer-overflow.c @@ -30,7 +30,7 @@ char *add_unsigned(char *base, unsigned long offset) { // CHECK-NEXT: store i64 %[[OFFSET]], ptr %[[OFFSET_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[OFFSET_RELOADED:.*]] = load i64, ptr %[[OFFSET_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i64 %[[OFFSET_RELOADED]] // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_AGGREGATE:.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 %[[OFFSET_RELOADED]]), !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_OFFSET_OVERFLOWED:.*]] = extractvalue { i64, i1 } %[[COMPUTED_OFFSET_AGGREGATE]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[OR_OV:.+]] = or i1 %[[COMPUTED_OFFSET_OVERFLOWED]], false, !nosanitize @@ -179,7 +179,7 @@ char *postinc(char *base) { // CHECK-NEXT: %[[BASE_ADDR:.*]] = alloca ptr, align 8 // CHECK-NEXT: store ptr %[[BASE]], ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i32 1 + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i32 1 // CHECK-SANITIZE-NEXT: %[[BASE_RELOADED_INT:.*]] = ptrtoint ptr %[[BASE_RELOADED]] to i64, !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_GEP:.*]] = add i64 %[[BASE_RELOADED_INT]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[BASE_IS_NOT_NULLPTR:.*]] = icmp ne ptr %[[BASE_RELOADED]], null, !nosanitize @@ -241,7 +241,7 @@ char *preinc(char *base) { // CHECK-NEXT: %[[BASE_ADDR:.*]] = alloca ptr, align 8 // CHECK-NEXT: store ptr %[[BASE]], ptr %[[BASE_ADDR]], align 8 // CHECK-NEXT: %[[BASE_RELOADED:.*]] = load ptr, ptr %[[BASE_ADDR]], align 8 - // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[BASE_RELOADED]], i32 1 + // CHECK-NEXT: %[[ADD_PTR:.*]] = getelementptr inbounds nuw i8, ptr %[[BASE_RELOADED]], i32 1 // CHECK-SANITIZE-NEXT: %[[BASE_RELOADED_INT:.*]] = ptrtoint ptr %[[BASE_RELOADED]] to i64, !nosanitize // CHECK-SANITIZE-NEXT: %[[COMPUTED_GEP:.*]] = add i64 %[[BASE_RELOADED_INT]], 1, !nosanitize // CHECK-SANITIZE-NEXT: %[[BASE_IS_NOT_NULLPTR:.*]] = icmp ne ptr %[[BASE_RELOADED]], null, !nosanitize diff --git a/clang/test/CodeGen/ext-int.c b/clang/test/CodeGen/ext-int.c index 714b7e122a706..e3d609a4ba4a2 100644 --- a/clang/test/CodeGen/ext-int.c +++ b/clang/test/CodeGen/ext-int.c @@ -154,7 +154,7 @@ _BitInt(129) *f1(_BitInt(129) *p) { } char *f2(char *p) { - // CHECK64: getelementptr inbounds i8, {{.*}} i64 24 + // CHECK64: getelementptr inbounds nuw i8, {{.*}} i64 24 return p + sizeof(_BitInt(129)); } diff --git a/clang/test/CodeGen/hexagon-brev-ld-ptr-incdec.c b/clang/test/CodeGen/hexagon-brev-ld-ptr-incdec.c index 7802168de4d75..d25d1e04f15fa 100644 --- a/clang/test/CodeGen/hexagon-brev-ld-ptr-incdec.c +++ b/clang/test/CodeGen/hexagon-brev-ld-ptr-incdec.c @@ -6,9 +6,9 @@ // the return value will be the value in A[2] // CHECK: @brev_ptr_inc // CHECK-DAG: 
llvm.hexagon.L2.loadri.pbr -// CHECK-DAG: getelementptr inbounds i8, {{.*}}i32 4 -// CHECK-NOT: getelementptr inbounds i8, {{.*}}i32 8 -// CHECK-NOT: getelementptr inbounds i8, {{.*}}i32 4 +// CHECK-DAG: getelementptr inbounds nuw i8, {{.*}}i32 4 +// CHECK-NOT: getelementptr inbounds nuw i8, {{.*}}i32 8 +// CHECK-NOT: getelementptr inbounds nuw i8, {{.*}}i32 4 int brev_ptr_inc(int A[], int B[]) { int *p0 = &B[0]; int *p1 = &A[0]; diff --git a/clang/test/CodeGen/integer-overflow.c b/clang/test/CodeGen/integer-overflow.c index 461b026d39615..9e8cde8b33b16 100644 --- a/clang/test/CodeGen/integer-overflow.c +++ b/clang/test/CodeGen/integer-overflow.c @@ -60,10 +60,10 @@ void test1(void) { // -fwrapv should turn off inbounds for GEP's, PR9256 extern int* P; ++P; - // DEFAULT: getelementptr inbounds i32, ptr + // DEFAULT: getelementptr inbounds nuw i32, ptr // WRAPV: getelementptr i32, ptr - // TRAPV: getelementptr inbounds i32, ptr - // CATCH_UB_POINTER: getelementptr inbounds i32, ptr + // TRAPV: getelementptr inbounds nuw i32, ptr + // CATCH_UB_POINTER: getelementptr inbounds nuw i32, ptr // NOCATCH_UB_POINTER: getelementptr i32, ptr // PR9350: char pre-increment never overflows. diff --git a/clang/test/CodeGen/ms-intrinsics.c b/clang/test/CodeGen/ms-intrinsics.c index c3d64fda0b901..459a708d9b2e0 100644 --- a/clang/test/CodeGen/ms-intrinsics.c +++ b/clang/test/CodeGen/ms-intrinsics.c @@ -156,7 +156,7 @@ unsigned char test_BitScanForward(unsigned long *Index, unsigned long Mask) { // CHECK: [[RESULT:%[a-z0-9._]+]] = phi i8 [ 0, %[[ISZERO_LABEL:[a-z0-9._]+]] ], [ 1, %[[ISNOTZERO_LABEL]] ] // CHECK: ret i8 [[RESULT]] // CHECK: [[ISNOTZERO_LABEL]]: -// CHECK: [[IDXGEP:%[a-z0-9._]+]] = getelementptr inbounds i8, ptr %Index, {{i64|i32}} 4 +// CHECK: [[IDXGEP:%[a-z0-9._]+]] = getelementptr inbounds nuw i8, ptr %Index, {{i64|i32}} 4 // CHECK: [[INDEX:%[0-9]+]] = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 %Mask, i1 true) // CHECK: store i32 [[INDEX]], ptr [[IDXGEP]], align 4 // CHECK: br label %[[END_LABEL]] @@ -171,7 +171,7 @@ unsigned char test_BitScanReverse(unsigned long *Index, unsigned long Mask) { // CHECK: [[RESULT:%[a-z0-9._]+]] = phi i8 [ 0, %[[ISZERO_LABEL:[a-z0-9._]+]] ], [ 1, %[[ISNOTZERO_LABEL]] ] // CHECK: ret i8 [[RESULT]] // CHECK: [[ISNOTZERO_LABEL]]: -// CHECK: [[IDXGEP:%[a-z0-9._]+]] = getelementptr inbounds i8, ptr %Index, {{i64|i32}} 4 +// CHECK: [[IDXGEP:%[a-z0-9._]+]] = getelementptr inbounds nuw i8, ptr %Index, {{i64|i32}} 4 // CHECK: [[REVINDEX:%[0-9]+]] = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 %Mask, i1 true) // CHECK: [[INDEX:%[0-9]+]] = xor i32 [[REVINDEX]], 31 // CHECK: store i32 [[INDEX]], ptr [[IDXGEP]], align 4 @@ -437,10 +437,10 @@ unsigned char test_InterlockedCompareExchange128( ++ExchangeLow, ++ComparandResult); } // CHECK-64: define{{.*}}i8 @test_InterlockedCompareExchange128(ptr{{[a-z_ ]*}}%Destination, i64{{[a-z_ ]*}}%ExchangeHigh, i64{{[a-z_ ]*}}%ExchangeLow, ptr{{[a-z_ ]*}}%ComparandResult){{.*}}{ -// CHECK-64: %incdec.ptr = getelementptr inbounds i8, ptr %Destination, i64 8 +// CHECK-64: %incdec.ptr = getelementptr inbounds nuw i8, ptr %Destination, i64 8 // CHECK-64: %inc = add nsw i64 %ExchangeHigh, 1 // CHECK-64: %inc1 = add nsw i64 %ExchangeLow, 1 -// CHECK-64: %incdec.ptr2 = getelementptr inbounds i8, ptr %ComparandResult, i64 8 +// CHECK-64: %incdec.ptr2 = getelementptr inbounds nuw i8, ptr %ComparandResult, i64 8 // CHECK-64: [[EH:%[0-9]+]] = zext i64 %inc to i128 // CHECK-64: [[EL:%[0-9]+]] = zext i64 %inc1 to i128 // CHECK-64: 
[[EHS:%[0-9]+]] = shl nuw i128 [[EH]], 64 @@ -486,7 +486,7 @@ short test_InterlockedIncrement16(short volatile *Addend) { return _InterlockedIncrement16(++Addend); } // CHECK: define{{.*}}i16 @test_InterlockedIncrement16(ptr{{[a-z_ ]*}}%Addend){{.*}}{ -// CHECK: %incdec.ptr = getelementptr inbounds i8, ptr %Addend, {{i64|i32}} 2 +// CHECK: %incdec.ptr = getelementptr inbounds nuw i8, ptr %Addend, {{i64|i32}} 2 // CHECK: [[TMP:%[0-9]+]] = atomicrmw add ptr %incdec.ptr, i16 1 seq_cst, align 2 // CHECK: [[RESULT:%[0-9]+]] = add i16 [[TMP]], 1 // CHECK: ret i16 [[RESULT]] @@ -496,7 +496,7 @@ long test_InterlockedIncrement(long volatile *Addend) { return _InterlockedIncrement(++Addend); } // CHECK: define{{.*}}i32 @test_InterlockedIncrement(ptr{{[a-z_ ]*}}%Addend){{.*}}{ -// CHECK: %incdec.ptr = getelementptr inbounds i8, ptr %Addend, {{i64|i32}} 4 +// CHECK: %incdec.ptr = getelementptr inbounds nuw i8, ptr %Addend, {{i64|i32}} 4 // CHECK: [[TMP:%[0-9]+]] = atomicrmw add ptr %incdec.ptr, i32 1 seq_cst, align 4 // CHECK: [[RESULT:%[0-9]+]] = add i32 [[TMP]], 1 // CHECK: ret i32 [[RESULT]] diff --git a/clang/test/CodeGen/ubsan-pointer-overflow.m b/clang/test/CodeGen/ubsan-pointer-overflow.m index 9192598da92fc..4ecdac655669f 100644 --- a/clang/test/CodeGen/ubsan-pointer-overflow.m +++ b/clang/test/CodeGen/ubsan-pointer-overflow.m @@ -5,7 +5,7 @@ void variable_len_array_arith(int n, int k) { int vla[n]; int (*p)[n] = &vla; - // CHECK: getelementptr inbounds i32, ptr {{.*}}, i64 [[INC:%.*]] + // CHECK: getelementptr inbounds nuw i32, ptr {{.*}}, i64 [[INC:%.*]] // CHECK: @llvm.smul.with.overflow.i64(i64 4, i64 [[INC]]), !nosanitize // CHECK-NOT: select // CHECK: call void @__ubsan_handle_pointer_overflow{{.*}} diff --git a/clang/test/CodeGen/vla.c b/clang/test/CodeGen/vla.c index 33621c5dd7a29..a22ba727df2fe 100644 --- a/clang/test/CodeGen/vla.c +++ b/clang/test/CodeGen/vla.c @@ -120,7 +120,7 @@ int test4(unsigned n, char (*p)[n][n+1][6]) { // CHECK-NEXT: [[T2:%.*]] = udiv i32 [[T1]], 2 // CHECK-NEXT: [[T3:%.*]] = mul nuw i32 [[DIM0]], [[DIM1]] // CHECK-NEXT: [[T4:%.*]] = mul nsw i32 [[T2]], [[T3]] - // CHECK-NEXT: [[T5:%.*]] = getelementptr inbounds [6 x i8], ptr [[T0]], i32 [[T4]] + // CHECK-NEXT: [[T5:%.*]] = getelementptr inbounds nuw [6 x i8], ptr [[T0]], i32 [[T4]] // CHECK-NEXT: [[T6:%.*]] = load i32, ptr [[N]], align 4 // CHECK-NEXT: [[T7:%.*]] = udiv i32 [[T6]], 4 // CHECK-NEXT: [[T8:%.*]] = sub i32 0, [[T7]] diff --git a/clang/test/CodeGenCXX/always_destroy.cpp b/clang/test/CodeGenCXX/always_destroy.cpp index e84c4cf02c52f..ca8a8e0cabacb 100644 --- a/clang/test/CodeGenCXX/always_destroy.cpp +++ b/clang/test/CodeGenCXX/always_destroy.cpp @@ -1,4 +1,7 @@ -// RUN: %clang_cc1 %s -fno-c++-static-destructors -emit-llvm -triple x86_64-apple-macosx10.13.0 -o - | FileCheck %s +// RUN: %clang_cc1 %s -fc++-static-destructors=none -emit-llvm -triple x86_64-apple-macosx10.13.0 -o - | \ +// RUN: FileCheck --check-prefixes=CHECK,NO-DTORS %s +// RUN: %clang_cc1 %s -fc++-static-destructors=thread-local -emit-llvm -triple x86_64-apple-macosx10.13.0 -o - | \ +// RUN: FileCheck --check-prefixes=CHECK,THREAD-LOCAL-DTORS %s struct NonTrivial { ~NonTrivial(); @@ -6,7 +9,8 @@ struct NonTrivial { // CHECK-NOT: __cxa_atexit{{.*}}_ZN10NonTrivialD1Ev NonTrivial nt1; -// CHECK-NOT: _tlv_atexit{{.*}}_ZN10NonTrivialD1Ev +// NO-DTORS-NOT: _tlv_atexit{{.*}}_ZN10NonTrivialD1Ev +// THREAD-LOCAL-DTORS: _tlv_atexit{{.*}}_ZN10NonTrivialD1Ev thread_local NonTrivial nt2; struct NonTrivial2 { diff --git 
a/clang/test/CodeGenCXX/attr-likelihood-iteration-stmt.cpp b/clang/test/CodeGenCXX/attr-likelihood-iteration-stmt.cpp index e842de6335046..fd9786de3a949 100644 --- a/clang/test/CodeGenCXX/attr-likelihood-iteration-stmt.cpp +++ b/clang/test/CodeGenCXX/attr-likelihood-iteration-stmt.cpp @@ -152,16 +152,16 @@ void f_branch_elided() // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[__BEGIN1]]) #[[ATTR3]] // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP1]], i64 0, i64 0 -// CHECK-NEXT: store ptr [[ARRAYDECAY]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16:![0-9]+]] +// CHECK-NEXT: store ptr [[ARRAYDECAY]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[__END1]]) #[[ATTR3]] // CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP2]], i64 0, i64 0 // CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[ARRAYDECAY1]], i64 4 -// CHECK-NEXT: store ptr [[ADD_PTR]], ptr [[__END1]], align 8, !tbaa [[TBAA16]] +// CHECK-NEXT: store ptr [[ADD_PTR]], ptr [[__END1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: br label [[FOR_COND:%.*]] // CHECK: for.cond: -// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[__END1]], align 8, !tbaa [[TBAA16]] +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[__END1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: [[CMP:%.*]] = icmp ne ptr [[TMP3]], [[TMP4]] // CHECK-NEXT: [[CMP_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[CMP]], i1 true) // CHECK-NEXT: br i1 [[CMP_EXPVAL]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] @@ -172,16 +172,16 @@ void f_branch_elided() // CHECK-NEXT: br label [[FOR_END:%.*]] // CHECK: for.body: // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR3]] -// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA2]] // CHECK-NEXT: store i32 [[TMP6]], ptr [[I]], align 4, !tbaa [[TBAA2]] // CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR3]] // CHECK-NEXT: br label [[FOR_INC:%.*]] // CHECK: for.inc: -// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 -// CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] -// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP7]], i32 1 +// CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] // CHECK: for.end: // CHECK-NEXT: ret void // @@ -204,16 +204,16 @@ void frl(int (&&e) [4]) // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[__BEGIN1]]) #[[ATTR3]] // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i32], ptr 
[[TMP1]], i64 0, i64 0 -// CHECK-NEXT: store ptr [[ARRAYDECAY]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] +// CHECK-NEXT: store ptr [[ARRAYDECAY]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[__END1]]) #[[ATTR3]] // CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__RANGE1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP2]], i64 0, i64 0 // CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[ARRAYDECAY1]], i64 4 -// CHECK-NEXT: store ptr [[ADD_PTR]], ptr [[__END1]], align 8, !tbaa [[TBAA16]] +// CHECK-NEXT: store ptr [[ADD_PTR]], ptr [[__END1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: br label [[FOR_COND:%.*]] // CHECK: for.cond: -// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[__END1]], align 8, !tbaa [[TBAA16]] +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[__END1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: [[CMP:%.*]] = icmp ne ptr [[TMP3]], [[TMP4]] // CHECK-NEXT: [[CMP_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[CMP]], i1 false) // CHECK-NEXT: br i1 [[CMP_EXPVAL]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] @@ -224,16 +224,16 @@ void frl(int (&&e) [4]) // CHECK-NEXT: br label [[FOR_END:%.*]] // CHECK: for.body: // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR3]] -// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA2]] // CHECK-NEXT: store i32 [[TMP6]], ptr [[I]], align 4, !tbaa [[TBAA2]] // CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR3]] // CHECK-NEXT: br label [[FOR_INC:%.*]] // CHECK: for.inc: -// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 -// CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA16]] -// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] +// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP7]], i32 1 +// CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8, !tbaa [[TBAA14]] +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] // CHECK: for.end: // CHECK-NEXT: ret void // diff --git a/clang/test/CodeGenCXX/attr-no-destroy-d54344.cpp b/clang/test/CodeGenCXX/attr-no-destroy-d54344.cpp index b03791e5135df..053043adb61c1 100644 --- a/clang/test/CodeGenCXX/attr-no-destroy-d54344.cpp +++ b/clang/test/CodeGenCXX/attr-no-destroy-d54344.cpp @@ -1,6 +1,7 @@ // RUN: %clang_cc1 -std=c++2a -emit-llvm -O0 -triple x86_64-unknown-linux-gnu -DNOATTR %s -o - | FileCheck %s // RUN: %clang_cc1 -std=c++2a -emit-llvm -O0 -triple x86_64-unknown-linux-gnu %s -o - | FileCheck %s --check-prefix=CHECK-ATTR -// RUN: %clang_cc1 -std=c++2a -emit-llvm -O0 -triple x86_64-unknown-linux-gnu -DNOATTR -fno-c++-static-destructors %s -o - | FileCheck %s --check-prefix=CHECK-FLAG +// RUN: %clang_cc1 -std=c++2a -emit-llvm -O0 -triple x86_64-unknown-linux-gnu -DNOATTR -fc++-static-destructors=none %s -o - | FileCheck %s --check-prefix=CHECK-FLAG +// RUN: %clang_cc1 
-std=c++2a -emit-llvm -O0 -triple x86_64-unknown-linux-gnu -DNOATTR -fc++-static-destructors=thread-local %s -o - | FileCheck %s --check-prefix=CHECK-FLAG // Regression test for D54344. Class with no user-defined destructor // that has an inherited member that has a non-trivial destructor diff --git a/clang/test/CodeGenCXX/for-range.cpp b/clang/test/CodeGenCXX/for-range.cpp index 10d27206d12e4..088a34647c374 100644 --- a/clang/test/CodeGenCXX/for-range.cpp +++ b/clang/test/CodeGenCXX/for-range.cpp @@ -33,7 +33,7 @@ B *end(C&); extern B array[5]; -// CHECK-LABEL: define {{[^@]+}}@_Z9for_arrayv( +// CHECK-LABEL: @_Z9for_arrayv( // CHECK-NEXT: entry: // CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_A:%.*]], align 1 // CHECK-NEXT: [[__RANGE1:%.*]] = alloca ptr, align 8 @@ -57,7 +57,7 @@ extern B array[5]; // CHECK-NEXT: br label [[FOR_INC:%.*]] // CHECK: for.inc: // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__BEGIN1]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds [[STRUCT_B]], ptr [[TMP3]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw [[STRUCT_B]], ptr [[TMP3]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8 // CHECK-NEXT: br label [[FOR_COND]] // CHECK: for.end: @@ -70,7 +70,7 @@ void for_array() { } } -// CHECK-LABEL: define {{[^@]+}}@_Z9for_rangev( +// CHECK-LABEL: @_Z9for_rangev( // CHECK-NEXT: entry: // CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_A:%.*]], align 1 // CHECK-NEXT: [[__RANGE1:%.*]] = alloca ptr, align 8 @@ -103,7 +103,7 @@ void for_array() { // CHECK-NEXT: br label [[FOR_INC:%.*]] // CHECK: for.inc: // CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[__BEGIN1]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds [[STRUCT_B]], ptr [[TMP5]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw [[STRUCT_B]], ptr [[TMP5]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8 // CHECK-NEXT: br label [[FOR_COND]] // CHECK: for.end: @@ -116,7 +116,7 @@ void for_range() { } } -// CHECK-LABEL: define {{[^@]+}}@_Z16for_member_rangev( +// CHECK-LABEL: @_Z16for_member_rangev( // CHECK-NEXT: entry: // CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_A:%.*]], align 1 // CHECK-NEXT: [[__RANGE1:%.*]] = alloca ptr, align 8 @@ -149,7 +149,7 @@ void for_range() { // CHECK-NEXT: br label [[FOR_INC:%.*]] // CHECK: for.inc: // CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[__BEGIN1]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds [[STRUCT_B]], ptr [[TMP5]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw [[STRUCT_B]], ptr [[TMP5]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN1]], align 8 // CHECK-NEXT: br label [[FOR_COND]] // CHECK: for.end: diff --git a/clang/test/CodeGenCXX/pr45964-decomp-transform.cpp b/clang/test/CodeGenCXX/pr45964-decomp-transform.cpp index f7df110ec0129..bcb2d875dce66 100644 --- a/clang/test/CodeGenCXX/pr45964-decomp-transform.cpp +++ b/clang/test/CodeGenCXX/pr45964-decomp-transform.cpp @@ -16,7 +16,7 @@ void (*d)(){test_transform<0>}; // CHECK-NEXT: [[BODY]]: // CHECK-NEXT: [[CUR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[NEXT:%.*]], %[[BODY]] ] // CHECK-NEXT: [[DEST:%.*]] = getelementptr inbounds i32, ptr [[BEGIN]], i64 [[CUR]] -// CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds [1 x i32], ptr @a, i64 0, i64 [[CUR]] +// CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds nuw [1 x i32], ptr @a, i64 0, i64 [[CUR]] // CHECK-NEXT: [[X:%.*]] = load i32, ptr [[SRC]] // CHECK-NEXT: store i32 [[X]], ptr [[DEST]] // CHECK-NEXT: 
[[NEXT]] = add nuw i64 [[CUR]], 1 diff --git a/clang/test/CodeGenCXX/vla.cpp b/clang/test/CodeGenCXX/vla.cpp index 4cf2b3b445b40..aadf51fce3a44 100644 --- a/clang/test/CodeGenCXX/vla.cpp +++ b/clang/test/CodeGenCXX/vla.cpp @@ -83,7 +83,7 @@ void test2(int b) { //CHECK: [[VLA_SIZEOF:%.*]] = mul nuw i64 4, [[VLA_NUM_ELEMENTS_PRE]] //CHECK-NEXT: [[VLA_NUM_ELEMENTS_POST:%.*]] = udiv i64 [[VLA_SIZEOF]], 4 - //CHECK-NEXT: [[VLA_END_PTR:%.*]] = getelementptr inbounds i32, ptr {{%.*}}, i64 [[VLA_NUM_ELEMENTS_POST]] + //CHECK-NEXT: [[VLA_END_PTR:%.*]] = getelementptr inbounds nuw i32, ptr {{%.*}}, i64 [[VLA_NUM_ELEMENTS_POST]] //X64-NEXT: store ptr [[VLA_END_PTR]], ptr %__end1 //AMDGCN-NEXT: store ptr [[VLA_END_PTR]], ptr [[END]] for (int d : varr) 0; @@ -116,7 +116,7 @@ void test3(int b, int c) { //CHECK-NEXT: [[VLA_SIZEOF_DIM2:%.*]] = mul nuw i64 4, [[VLA_DIM2_PRE]] //CHECK-NEXT: [[VLA_NUM_ELEMENTS:%.*]] = udiv i64 [[VLA_SIZEOF]], [[VLA_SIZEOF_DIM2]] //CHECK-NEXT: [[VLA_END_INDEX:%.*]] = mul nsw i64 [[VLA_NUM_ELEMENTS]], [[VLA_DIM2_PRE]] - //CHECK-NEXT: [[VLA_END_PTR:%.*]] = getelementptr inbounds i32, ptr {{%.*}}, i64 [[VLA_END_INDEX]] + //CHECK-NEXT: [[VLA_END_PTR:%.*]] = getelementptr inbounds nuw i32, ptr {{%.*}}, i64 [[VLA_END_INDEX]] //X64-NEXT: store ptr [[VLA_END_PTR]], ptr %__end //AMDGCN-NEXT: store ptr [[VLA_END_PTR]], ptr [[END]] diff --git a/clang/test/CodeGenHLSL/buffer-array-operator.hlsl b/clang/test/CodeGenHLSL/buffer-array-operator.hlsl index f5556df30871c..02e570ebdcb4f 100644 --- a/clang/test/CodeGenHLSL/buffer-array-operator.hlsl +++ b/clang/test/CodeGenHLSL/buffer-array-operator.hlsl @@ -17,7 +17,7 @@ void fn(int Idx) { // CHECK-NEXT: %h = getelementptr inbounds nuw %"class.hlsl::RWBuffer", ptr %this1, i32 0, i32 0 // CHECK-NEXT: %0 = load ptr, ptr %h, align 4 // CHECK-NEXT: %1 = load i32, ptr %Idx.addr, align 4 -// CHECK-NEXT: %arrayidx = getelementptr inbounds float, ptr %0, i32 %1 +// CHECK-NEXT: %arrayidx = getelementptr inbounds nuw float, ptr %0, i32 %1 // CHECK-NEXT: ret ptr %arrayidx // Const comes next, and returns the pointer instead of the value. 
@@ -26,5 +26,5 @@ void fn(int Idx) { // CHECK-NEXT: %h = getelementptr inbounds nuw %"class.hlsl::RWBuffer", ptr %this1, i32 0, i32 0 // CHECK-NEXT: %0 = load ptr, ptr %h, align 4 // CHECK-NEXT: %1 = load i32, ptr %Idx.addr, align 4 -// CHECK-NEXT: %arrayidx = getelementptr inbounds float, ptr %0, i32 %1 +// CHECK-NEXT: %arrayidx = getelementptr inbounds nuw float, ptr %0, i32 %1 // CHECK-NEXT: ret ptr %arrayidx diff --git a/clang/test/CodeGenSYCL/address-space-deduction.cpp b/clang/test/CodeGenSYCL/address-space-deduction.cpp index 96075a47343fe..5910ec3bfc305 100644 --- a/clang/test/CodeGenSYCL/address-space-deduction.cpp +++ b/clang/test/CodeGenSYCL/address-space-deduction.cpp @@ -33,55 +33,55 @@ // CHECK-NEXT: store ptr addrspace(4) [[I_ASCAST]], ptr addrspace(4) [[PPTR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PPTR_ASCAST]], align 8 // CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr addrspace(4) [[TMP0]], [[I_ASCAST]] -// CHECK-NEXT: [[FROMBOOL:%.*]] = zext i1 [[CMP]] to i8 -// CHECK-NEXT: store i8 [[FROMBOOL]], ptr addrspace(4) [[IS_I_PTR_ASCAST]], align 1 +// CHECK-NEXT: [[STOREDV:%.*]] = zext i1 [[CMP]] to i8 +// CHECK-NEXT: store i8 [[STOREDV]], ptr addrspace(4) [[IS_I_PTR_ASCAST]], align 1 // CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PPTR_ASCAST]], align 8 // CHECK-NEXT: store i32 66, ptr addrspace(4) [[TMP1]], align 4 // CHECK-NEXT: store i32 23, ptr addrspace(4) [[VAR23_ASCAST]], align 4 // CHECK-NEXT: store ptr addrspace(4) [[VAR23_ASCAST]], ptr addrspace(4) [[CP_ASCAST]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[CP_ASCAST]], align 8 -// CHECK-NEXT: store i8 41, ptr addrspace(4) [[TMP3]], align 1 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[CP_ASCAST]], align 8 +// CHECK-NEXT: store i8 41, ptr addrspace(4) [[TMP2]], align 1 // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [42 x i32], ptr addrspace(4) [[ARR_ASCAST]], i64 0, i64 0 // CHECK-NEXT: store ptr addrspace(4) [[ARRAYDECAY]], ptr addrspace(4) [[CPP_ASCAST]], align 8 -// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[CPP_ASCAST]], align 8 -// CHECK-NEXT: store i8 43, ptr addrspace(4) [[TMP5]], align 1 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[CPP_ASCAST]], align 8 +// CHECK-NEXT: store i8 43, ptr addrspace(4) [[TMP3]], align 1 // CHECK-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [42 x i32], ptr addrspace(4) [[ARR_ASCAST]], i64 0, i64 0 // CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[ARRAYDECAY1]], i64 10 // CHECK-NEXT: store ptr addrspace(4) [[ADD_PTR]], ptr addrspace(4) [[APTR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[APTR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[APTR_ASCAST]], align 8 // CHECK-NEXT: [[ARRAYDECAY2:%.*]] = getelementptr inbounds [42 x i32], ptr addrspace(4) [[ARR_ASCAST]], i64 0, i64 0 -// CHECK-NEXT: [[ADD_PTR3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[ARRAYDECAY2]], i64 168 -// CHECK-NEXT: [[CMP4:%.*]] = icmp ult ptr addrspace(4) [[TMP6]], [[ADD_PTR3]] +// CHECK-NEXT: [[ADD_PTR3:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[ARRAYDECAY2]], i64 168 +// CHECK-NEXT: [[CMP4:%.*]] = icmp ult ptr addrspace(4) [[TMP4]], [[ADD_PTR3]] // CHECK-NEXT: br i1 [[CMP4]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] // CHECK: if.then: -// CHECK-NEXT: [[TMP7:%.*]] = 
load ptr addrspace(4), ptr addrspace(4) [[APTR_ASCAST]], align 8 -// CHECK-NEXT: store i32 44, ptr addrspace(4) [[TMP7]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[APTR_ASCAST]], align 8 +// CHECK-NEXT: store i32 44, ptr addrspace(4) [[TMP5]], align 4 // CHECK-NEXT: br label [[IF_END]] // CHECK: if.end: // CHECK-NEXT: store ptr addrspace(4) addrspacecast (ptr addrspace(1) @.str to ptr addrspace(4)), ptr addrspace(4) [[STR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[STR_ASCAST]], align 8 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP8]], i64 0 -// CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr addrspace(4) [[ARRAYIDX]], align 1 -// CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP9]] to i32 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[STR_ASCAST]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP6]], i64 0 +// CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(4) [[ARRAYIDX]], align 1 +// CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP7]] to i32 // CHECK-NEXT: store i32 [[CONV]], ptr addrspace(4) [[I_ASCAST]], align 4 -// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(4) [[I_ASCAST]], align 4 -// CHECK-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP10]], 2 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(4) [[I_ASCAST]], align 4 +// CHECK-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP8]], 2 // CHECK-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] // CHECK: cond.true: -// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[STR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[STR_ASCAST]], align 8 // CHECK-NEXT: br label [[COND_END:%.*]] // CHECK: cond.false: // CHECK-NEXT: br label [[COND_END]] // CHECK: cond.end: -// CHECK-NEXT: [[COND:%.*]] = phi ptr addrspace(4) [ [[TMP11]], [[COND_TRUE]] ], [ addrspacecast (ptr addrspace(1) @.str.1 to ptr addrspace(4)), [[COND_FALSE]] ] +// CHECK-NEXT: [[COND:%.*]] = phi ptr addrspace(4) [ [[TMP9]], [[COND_TRUE]] ], [ addrspacecast (ptr addrspace(1) @.str.1 to ptr addrspace(4)), [[COND_FALSE]] ] // CHECK-NEXT: store ptr addrspace(4) [[COND]], ptr addrspace(4) [[PHI_STR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(4) [[I_ASCAST]], align 4 -// CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], 2 -// CHECK-NEXT: [[TMP13:%.*]] = zext i1 [[CMP6]] to i64 +// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(4) [[I_ASCAST]], align 4 +// CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP10]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = zext i1 [[CMP6]] to i64 // CHECK-NEXT: [[COND7:%.*]] = select i1 [[CMP6]], ptr addrspace(4) addrspacecast (ptr addrspace(1) @.str.2 to ptr addrspace(4)), ptr addrspace(4) null // CHECK-NEXT: store ptr addrspace(4) [[COND7]], ptr addrspace(4) [[SELECT_NULL_ASCAST]], align 8 -// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[STR_ASCAST]], align 8 -// CHECK-NEXT: store ptr addrspace(4) [[TMP14]], ptr addrspace(4) [[SELECT_STR_TRIVIAL1_ASCAST]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[STR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(4) [[TMP12]], ptr addrspace(4) [[SELECT_STR_TRIVIAL1_ASCAST]], align 8 // CHECK-NEXT: store ptr addrspace(4) addrspacecast (ptr addrspace(1) @.str.1 to ptr addrspace(4)), ptr addrspace(4) [[SELECT_STR_TRIVIAL2_ASCAST]], align 8 // CHECK-NEXT: ret void // diff --git 
a/clang/test/Driver/cxx-static-destructors.cpp b/clang/test/Driver/cxx-static-destructors.cpp new file mode 100644 index 0000000000000..a12a292a387bd --- /dev/null +++ b/clang/test/Driver/cxx-static-destructors.cpp @@ -0,0 +1,11 @@ +// RUN: %clang -### -c -fc++-static-destructors=all %s 2>&1 | FileCheck --check-prefix ALL %s +// RUN: %clang -### -c -fc++-static-destructors %s 2>&1 | FileCheck --check-prefix ALL %s +// RUN: %clang -### -c -fno-c++-static-destructors -fc++-static-destructors %s 2>&1 | FileCheck --check-prefix ALL %s +// RUN: %clang -### -c -fc++-static-destructors=none %s 2>&1 | FileCheck --check-prefix NONE %s +// RUN: %clang -### -c -fno-c++-static-destructors %s 2>&1 | FileCheck --check-prefix NONE %s +// RUN: %clang -### -c -fc++-static-destructors -fno-c++-static-destructors %s 2>&1 | FileCheck --check-prefix NONE %s +// RUN: %clang -### -c -fc++-static-destructors=thread-local %s 2>&1 | FileCheck --check-prefix THREAD-LOCAL %s + +// ALL: -fc++-static-destructors=all +// NONE: -fc++-static-destructors=none +// THREAD-LOCAL: -fc++-static-destructors=thread-local diff --git a/clang/test/Driver/offload-packager.c b/clang/test/Driver/offload-packager.c index 9adc202322521..fb5f1006cf9e2 100644 --- a/clang/test/Driver/offload-packager.c +++ b/clang/test/Driver/offload-packager.c @@ -3,63 +3,64 @@ // REQUIRES: amdgpu-registered-target // UNSUPPORTED: system-windows -// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.elf.o +// RUN: rm -rf %t && mkdir -p %t +// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t/elf.o // Check that we can extract files from the packaged binary. -// RUN: clang-offload-packager -o %t.out \ -// RUN: --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \ -// RUN: --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_80 \ -// RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \ -// RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx90a \ -// RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx90c -// RUN: clang-offload-packager %t.out \ -// RUN: --image=file=%t-sm_70.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \ -// RUN: --image=file=%t-gfx908.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 -// RUN: diff %t-sm_70.o %t.elf.o -// RUN: diff %t-gfx908.o %t.elf.o +// RUN: clang-offload-packager -o %t/package.out \ +// RUN: --image=file=%t/elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \ +// RUN: --image=file=%t/elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_80 \ +// RUN: --image=file=%t/elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \ +// RUN: --image=file=%t/elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx90a \ +// RUN: --image=file=%t/elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx90c +// RUN: clang-offload-packager %t/package.out \ +// RUN: --image=file=%t/sm_70.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \ +// RUN: --image=file=%t/gfx908.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 +// RUN: diff %t/sm_70.o %t/elf.o +// RUN: diff %t/gfx908.o %t/elf.o // Check that we generate a new name if one is not given -// RUN: clang-offload-packager -o %t \ -// RUN: --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \ -// RUN: --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_80 \ -// RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \ -// RUN: 
--image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx90a \ -// RUN: --image=file=%t.elf.o,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx90c -// RUN: cd $(dirname "%t") && clang-offload-packager %t --image=kind=openmp -// RUN: diff *-nvptx64-nvidia-cuda-sm_70.0.o %t.elf.o; rm *-nvptx64-nvidia-cuda-sm_70.0.o -// RUN: diff *-nvptx64-nvidia-cuda-sm_80.1.o %t.elf.o; rm *-nvptx64-nvidia-cuda-sm_80.1.o -// RUN: diff *-amdgcn-amd-amdhsa-gfx908.2.o %t.elf.o; rm *-amdgcn-amd-amdhsa-gfx908.2.o -// RUN: diff *-amdgcn-amd-amdhsa-gfx90a.3.o %t.elf.o; rm *-amdgcn-amd-amdhsa-gfx90a.3.o -// RUN: not diff *-amdgcn-amd-amdhsa-gfx90c.4.o %t.elf.o +// RUN: clang-offload-packager -o %t/package \ +// RUN: --image=file=%t/elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \ +// RUN: --image=file=%t/elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_80 \ +// RUN: --image=file=%t/elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \ +// RUN: --image=file=%t/elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx90a \ +// RUN: --image=file=%t/elf.o,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx90c +// RUN: clang-offload-packager %t/package --image=kind=openmp +// RUN: diff *-nvptx64-nvidia-cuda-sm_70.0.o %t/elf.o; rm *-nvptx64-nvidia-cuda-sm_70.0.o +// RUN: diff *-nvptx64-nvidia-cuda-sm_80.1.o %t/elf.o; rm *-nvptx64-nvidia-cuda-sm_80.1.o +// RUN: diff *-amdgcn-amd-amdhsa-gfx908.2.o %t/elf.o; rm *-amdgcn-amd-amdhsa-gfx908.2.o +// RUN: diff *-amdgcn-amd-amdhsa-gfx90a.3.o %t/elf.o; rm *-amdgcn-amd-amdhsa-gfx90a.3.o +// RUN: not diff *-amdgcn-amd-amdhsa-gfx90c.4.o %t/elf.o // Check that we can extract from an ELF object file -// RUN: clang-offload-packager -o %t.out \ -// RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \ -// RUN: --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 -// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out -// RUN: clang-offload-packager %t.out \ -// RUN: --image=file=%t-sm_70.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \ -// RUN: --image=file=%t-gfx908.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 -// RUN: diff %t-sm_70.o %t.elf.o -// RUN: diff %t-gfx908.o %t.elf.o +// RUN: clang-offload-packager -o %t/package.out \ +// RUN: --image=file=%t/elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \ +// RUN: --image=file=%t/elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 +// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t/package.o -fembed-offload-object=%t/package.out +// RUN: clang-offload-packager %t/package.out \ +// RUN: --image=file=%t/sm_70.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \ +// RUN: --image=file=%t/gfx908.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 +// RUN: diff %t/sm_70.o %t/elf.o +// RUN: diff %t/gfx908.o %t/elf.o // Check that we can extract from a bitcode file -// RUN: clang-offload-packager -o %t.out \ -// RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \ -// RUN: --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 -// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-llvm -o %t.bc -fembed-offload-object=%t.out -// RUN: clang-offload-packager %t.out \ -// RUN: --image=file=%t-sm_70.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \ -// RUN: --image=file=%t-gfx908.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 -// RUN: diff %t-sm_70.o %t.elf.o -// RUN: diff %t-gfx908.o %t.elf.o +// RUN: clang-offload-packager -o %t/package.out \ +// RUN: 
--image=file=%t/elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \ +// RUN: --image=file=%t/elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 +// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-llvm -o %t/package.bc -fembed-offload-object=%t/package.out +// RUN: clang-offload-packager %t/package.out \ +// RUN: --image=file=%t/sm_70.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \ +// RUN: --image=file=%t/gfx908.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 +// RUN: diff %t/sm_70.o %t/elf.o +// RUN: diff %t/gfx908.o %t/elf.o // Check that we can extract from an archive file to an archive file. -// RUN: clang-offload-packager -o %t.out \ -// RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \ -// RUN: --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 -// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out -// RUN: llvm-ar rcs %t.a %t.o -// RUN: clang-offload-packager %t.a --archive --image=file=%t-gfx908.a,arch=gfx908 -// RUN: llvm-ar t %t-gfx908.a 2>&1 | FileCheck %s +// RUN: clang-offload-packager -o %t/package.out \ +// RUN: --image=file=%t/elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \ +// RUN: --image=file=%t/elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 +// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t/package.o -fembed-offload-object=%t/package.out +// RUN: llvm-ar rcs %t/package.a %t/package.o +// RUN: clang-offload-packager %t/package.a --archive --image=file=%t/gfx908.a,arch=gfx908 +// RUN: llvm-ar t %t/gfx908.a 2>&1 | FileCheck %s // CHECK: {{.*}}.o diff --git a/clang/test/ExtractAPI/bool.c b/clang/test/ExtractAPI/bool.c index efab6dfeef03b..7a4d34acb0d26 100644 --- a/clang/test/ExtractAPI/bool.c +++ b/clang/test/ExtractAPI/bool.c @@ -2,8 +2,8 @@ // RUN: split-file %s %t // RUN: sed -e "s@INPUT_DIR@%{/t:regex_replacement}@g" \ // RUN: %t/reference.output.json.in >> %t/reference.output.json -// RUN: %clang -extract-api --pretty-sgf -target arm64-apple-macosx \ -// RUN: %t/input.h -o %t/output.json +// RUN: %clang_cc1 -extract-api --product-name=BoolTest --pretty-sgf -triple arm64-apple-macosx \ +// RUN: %t/input.h -o %t/output.json // Generator version is not consistent across test runs, normalize it. // RUN: sed -e "s@\"generator\": \".*\"@\"generator\": \"?\"@g" \ @@ -15,7 +15,7 @@ bool Foo; bool IsFoo(bool Bar); -/// expected-no-diagnostics +// expected-no-diagnostics //--- reference.output.json.in { @@ -28,7 +28,7 @@ bool IsFoo(bool Bar); "generator": "?" 
}, "module": { - "name": "", + "name": "BoolTest", "platform": { "architecture": "arm64", "operatingSystem": { diff --git a/clang/test/ExtractAPI/emit-symbol-graph/multi_file.c b/clang/test/ExtractAPI/emit-symbol-graph/multi_file.c index e668f69bc7e05..651e0df2cd93a 100644 --- a/clang/test/ExtractAPI/emit-symbol-graph/multi_file.c +++ b/clang/test/ExtractAPI/emit-symbol-graph/multi_file.c @@ -27,9 +27,6 @@ #ifndef TEST_H #define TEST_H -#define testmarcro1 32 -#define testmacro2 42 - int testfunc (int param1, int param2); void testfunc2 (); #endif /* TEST_H */ @@ -185,7 +182,7 @@ int main () "location": { "position": { "character": 4, - "line": 6 + "line": 3 }, "uri": "file://INPUT_DIR/test.h" }, @@ -249,7 +246,7 @@ int main () "location": { "position": { "character": 5, - "line": 7 + "line": 4 }, "uri": "file://INPUT_DIR/test.h" }, @@ -335,106 +332,6 @@ int main () "pathComponents": [ "main" ] - }, - { - "accessLevel": "public", - "declarationFragments": [ - { - "kind": "keyword", - "spelling": "#define" - }, - { - "kind": "text", - "spelling": " " - }, - { - "kind": "identifier", - "spelling": "testmarcro1" - } - ], - "identifier": { - "interfaceLanguage": "c", - "precise": "c:test.h@39@macro@testmarcro1" - }, - "kind": { - "displayName": "Macro", - "identifier": "c.macro" - }, - "location": { - "position": { - "character": 8, - "line": 3 - }, - "uri": "file://INPUT_DIR/test.h" - }, - "names": { - "navigator": [ - { - "kind": "identifier", - "spelling": "testmarcro1" - } - ], - "subHeading": [ - { - "kind": "identifier", - "spelling": "testmarcro1" - } - ], - "title": "testmarcro1" - }, - "pathComponents": [ - "testmarcro1" - ] - }, - { - "accessLevel": "public", - "declarationFragments": [ - { - "kind": "keyword", - "spelling": "#define" - }, - { - "kind": "text", - "spelling": " " - }, - { - "kind": "identifier", - "spelling": "testmacro2" - } - ], - "identifier": { - "interfaceLanguage": "c", - "precise": "c:test.h@62@macro@testmacro2" - }, - "kind": { - "displayName": "Macro", - "identifier": "c.macro" - }, - "location": { - "position": { - "character": 8, - "line": 4 - }, - "uri": "file://INPUT_DIR/test.h" - }, - "names": { - "navigator": [ - { - "kind": "identifier", - "spelling": "testmacro2" - } - ], - "subHeading": [ - { - "kind": "identifier", - "spelling": "testmacro2" - } - ], - "title": "testmacro2" - }, - "pathComponents": [ - "testmacro2" - ] } ] } @@ -573,7 +470,7 @@ int main () "location": { "position": { "character": 4, - "line": 6 + "line": 3 }, "uri": "file://INPUT_DIR/test.h" }, @@ -637,7 +534,7 @@ int main () "location": { "position": { "character": 5, - "line": 7 + "line": 4 }, "uri": "file://INPUT_DIR/test.h" }, @@ -659,106 +556,6 @@ int main () "pathComponents": [ "testfunc2" ] - }, - { - "accessLevel": "public", - "declarationFragments": [ - { - "kind": "keyword", - "spelling": "#define" - }, - { - "kind": "text", - "spelling": " " - }, - { - "kind": "identifier", - "spelling": "testmarcro1" - } - ], - "identifier": { - "interfaceLanguage": "c", - "precise": "c:test.h@39@macro@testmarcro1" - }, - "kind": { - "displayName": "Macro", - "identifier": "c.macro" - }, - "location": { - "position": { - "character": 8, - "line": 3 - }, - "uri": "file://INPUT_DIR/test.h" - }, - "names": { - "navigator": [ - { - "kind": "identifier", - "spelling": "testmarcro1" - } - ], - "subHeading": [ - { - "kind": "identifier", - "spelling": "testmarcro1" - } - ], - "title": "testmarcro1" - }, - "pathComponents": [ - "testmarcro1" - ] - }, - { - "accessLevel": "public", - 
"declarationFragments": [ - { - "kind": "keyword", - "spelling": "#define" - }, - { - "kind": "text", - "spelling": " " - }, - { - "kind": "identifier", - "spelling": "testmacro2" - } - ], - "identifier": { - "interfaceLanguage": "c", - "precise": "c:test.h@62@macro@testmacro2" - }, - "kind": { - "displayName": "Macro", - "identifier": "c.macro" - }, - "location": { - "position": { - "character": 8, - "line": 4 - }, - "uri": "file://INPUT_DIR/test.h" - }, - "names": { - "navigator": [ - { - "kind": "identifier", - "spelling": "testmacro2" - } - ], - "subHeading": [ - { - "kind": "identifier", - "spelling": "testmacro2" - } - ], - "title": "testmacro2" - }, - "pathComponents": [ - "testmacro2" - ] } ] } diff --git a/clang/test/ExtractAPI/emit-symbol-graph/single_file.c b/clang/test/ExtractAPI/emit-symbol-graph/single_file.c index b00b5f5237c9a..feb759f5947bc 100644 --- a/clang/test/ExtractAPI/emit-symbol-graph/single_file.c +++ b/clang/test/ExtractAPI/emit-symbol-graph/single_file.c @@ -15,9 +15,6 @@ // CHECK-NOT: warning: //--- main.c -#define TESTMACRO1 2 -#define TESTMARCRO2 5 - int main () { return 0; @@ -87,7 +84,7 @@ int main () "location": { "position": { "character": 4, - "line": 3 + "line": 0 }, "uri": "file://INPUT_DIR/main.c" }, @@ -109,106 +106,6 @@ int main () "pathComponents": [ "main" ] - }, - { - "accessLevel": "public", - "declarationFragments": [ - { - "kind": "keyword", - "spelling": "#define" - }, - { - "kind": "text", - "spelling": " " - }, - { - "kind": "identifier", - "spelling": "TESTMACRO1" - } - ], - "identifier": { - "interfaceLanguage": "c", - "precise": "c:main.c@8@macro@TESTMACRO1" - }, - "kind": { - "displayName": "Macro", - "identifier": "c.macro" - }, - "location": { - "position": { - "character": 8, - "line": 0 - }, - "uri": "file://INPUT_DIR/main.c" - }, - "names": { - "navigator": [ - { - "kind": "identifier", - "spelling": "TESTMACRO1" - } - ], - "subHeading": [ - { - "kind": "identifier", - "spelling": "TESTMACRO1" - } - ], - "title": "TESTMACRO1" - }, - "pathComponents": [ - "TESTMACRO1" - ] - }, - { - "accessLevel": "public", - "declarationFragments": [ - { - "kind": "keyword", - "spelling": "#define" - }, - { - "kind": "text", - "spelling": " " - }, - { - "kind": "identifier", - "spelling": "TESTMARCRO2" - } - ], - "identifier": { - "interfaceLanguage": "c", - "precise": "c:main.c@29@macro@TESTMARCRO2" - }, - "kind": { - "displayName": "Macro", - "identifier": "c.macro" - }, - "location": { - "position": { - "character": 8, - "line": 1 - }, - "uri": "file://INPUT_DIR/main.c" - }, - "names": { - "navigator": [ - { - "kind": "identifier", - "spelling": "TESTMARCRO2" - } - ], - "subHeading": [ - { - "kind": "identifier", - "spelling": "TESTMARCRO2" - } - ], - "title": "TESTMARCRO2" - }, - "pathComponents": [ - "TESTMARCRO2" - ] } ] } diff --git a/clang/test/ExtractAPI/macros.c b/clang/test/ExtractAPI/macros.c index 10003fe6f6e40..15eb5f6a7f66f 100644 --- a/clang/test/ExtractAPI/macros.c +++ b/clang/test/ExtractAPI/macros.c @@ -1,416 +1,358 @@ // RUN: rm -rf %t -// RUN: split-file %s %t -// RUN: sed -e "s@INPUT_DIR@%{/t:regex_replacement}@g" \ -// RUN: %t/reference.output.json.in >> %t/reference.output.json -// RUN: %clang -extract-api --pretty-sgf --product-name=Macros -target arm64-apple-macosx \ -// RUN: -x objective-c-header %t/input.h -o %t/output.json | FileCheck -allow-empty %s +// RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing --product-name=Macros -triple arm64-apple-macosx \ +// RUN: -isystem %S -x 
objective-c-header %s -o %t/output.symbols.json -// Generator version is not consistent across test runs, normalize it. -// RUN: sed -e "s@\"generator\": \".*\"@\"generator\": \"?\"@g" \ -// RUN: %t/output.json >> %t/output-normalized.json -// RUN: diff %t/reference.output.json %t/output-normalized.json - -// CHECK-NOT: error: -// CHECK-NOT: warning: - -//--- input.h +// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix HELLO #define HELLO 1 +// HELLO-LABEL: "!testLabel": "c:@macro@HELLO" +// HELLO: "accessLevel": "public", +// HELLO-NEXT: "declarationFragments": [ +// HELLO-NEXT: { +// HELLO-NEXT: "kind": "keyword", +// HELLO-NEXT: "spelling": "#define" +// HELLO-NEXT: }, +// HELLO-NEXT: { +// HELLO-NEXT: "kind": "text", +// HELLO-NEXT: "spelling": " " +// HELLO-NEXT: }, +// HELLO-NEXT: { +// HELLO-NEXT: "kind": "identifier", +// HELLO-NEXT: "spelling": "HELLO" +// HELLO-NEXT: } +// HELLO-NEXT: ], +// HELLO: "kind": { +// HELLO-NEXT: "displayName": "Macro", +// HELLO-NEXT: "identifier": "objective-c.macro" +// HELLO-NEXT: }, +// HELLO-NEXT: "location": { +// HELLO-NEXT: "position": { +// HELLO-NEXT: "character": 8, +// HELLO-NEXT: "line": [[# @LINE - 25]] +// HELLO-NEXT: }, +// HELLO-NEXT: "uri": "file://{{.*}}/macros.c" +// HELLO-NEXT: }, +// HELLO-NEXT: "names": { +// HELLO-NEXT: "navigator": [ +// HELLO-NEXT: { +// HELLO-NEXT: "kind": "identifier", +// HELLO-NEXT: "spelling": "HELLO" +// HELLO-NEXT: } +// HELLO-NEXT: ], +// HELLO-NEXT: "subHeading": [ +// HELLO-NEXT: { +// HELLO-NEXT: "kind": "identifier", +// HELLO-NEXT: "spelling": "HELLO" +// HELLO-NEXT: } +// HELLO-NEXT: ], +// HELLO-NEXT: "title": "HELLO" +// HELLO-NEXT: }, +// HELLO-NEXT: "pathComponents": [ +// HELLO-NEXT: "HELLO" +// HELLO-NEXT: ] + +// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix WORLD #define WORLD 2 +// WORLD-LABEL: "!testLabel": "c:@macro@WORLD" +// WORLD: "accessLevel": "public", +// WORLD-NEXT: "declarationFragments": [ +// WORLD-NEXT: { +// WORLD-NEXT: "kind": "keyword", +// WORLD-NEXT: "spelling": "#define" +// WORLD-NEXT: }, +// WORLD-NEXT: { +// WORLD-NEXT: "kind": "text", +// WORLD-NEXT: "spelling": " " +// WORLD-NEXT: }, +// WORLD-NEXT: { +// WORLD-NEXT: "kind": "identifier", +// WORLD-NEXT: "spelling": "WORLD" +// WORLD-NEXT: } +// WORLD-NEXT: ], +// WORLD: "kind": { +// WORLD-NEXT: "displayName": "Macro", +// WORLD-NEXT: "identifier": "objective-c.macro" +// WORLD-NEXT: }, +// WORLD-NEXT: "location": { +// WORLD-NEXT: "position": { +// WORLD-NEXT: "character": 8, +// WORLD-NEXT: "line": [[# @LINE - 25]] +// WORLD-NEXT: }, +// WORLD-NEXT: "uri": "file://{{.*}}/macros.c" +// WORLD-NEXT: }, +// WORLD-NEXT: "names": { +// WORLD-NEXT: "navigator": [ +// WORLD-NEXT: { +// WORLD-NEXT: "kind": "identifier", +// WORLD-NEXT: "spelling": "WORLD" +// WORLD-NEXT: } +// WORLD-NEXT: ], +// WORLD-NEXT: "subHeading": [ +// WORLD-NEXT: { +// WORLD-NEXT: "kind": "identifier", +// WORLD-NEXT: "spelling": "WORLD" +// WORLD-NEXT: } +// WORLD-NEXT: ], +// WORLD-NEXT: "title": "WORLD" +// WORLD-NEXT: }, +// WORLD-NEXT: "pathComponents": [ +// WORLD-NEXT: "WORLD" +// WORLD-NEXT: ] + +// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix MACRO_FUN #define MACRO_FUN(x) x x +// MACRO_FUN-LABEL: "!testLabel": "c:@macro@MACRO_FUN" +// MACRO_FUN-NEXT: "accessLevel": "public", +// MACRO_FUN-NEXT: "declarationFragments": [ +// MACRO_FUN-NEXT: { +// MACRO_FUN-NEXT: "kind": "keyword", +// MACRO_FUN-NEXT: "spelling": "#define" +// MACRO_FUN-NEXT: }, +// MACRO_FUN-NEXT: { 
+// MACRO_FUN-NEXT: "kind": "text", +// MACRO_FUN-NEXT: "spelling": " " +// MACRO_FUN-NEXT: }, +// MACRO_FUN-NEXT: { +// MACRO_FUN-NEXT: "kind": "identifier", +// MACRO_FUN-NEXT: "spelling": "MACRO_FUN" +// MACRO_FUN-NEXT: }, +// MACRO_FUN-NEXT: { +// MACRO_FUN-NEXT: "kind": "text", +// MACRO_FUN-NEXT: "spelling": "(" +// MACRO_FUN-NEXT: }, +// MACRO_FUN-NEXT: { +// MACRO_FUN-NEXT: "kind": "internalParam", +// MACRO_FUN-NEXT: "spelling": "x" +// MACRO_FUN-NEXT: }, +// MACRO_FUN-NEXT: { +// MACRO_FUN-NEXT: "kind": "text", +// MACRO_FUN-NEXT: "spelling": ")" +// MACRO_FUN-NEXT: } +// MACRO_FUN-NEXT: ], +// MACRO_FUN: "kind": { +// MACRO_FUN-NEXT: "displayName": "Macro", +// MACRO_FUN-NEXT: "identifier": "objective-c.macro" +// MACRO_FUN-NEXT: }, +// MACRO_FUN-NEXT: "location": { +// MACRO_FUN-NEXT: "position": { +// MACRO_FUN-NEXT: "character": 8, +// MACRO_FUN-NEXT: "line": [[# @LINE - 37]] +// MACRO_FUN-NEXT: }, +// MACRO_FUN-NEXT: "uri": "file://{{.*}}/macros.c" +// MACRO_FUN-NEXT: }, +// MACRO_FUN-NEXT: "names": { +// MACRO_FUN-NEXT: "navigator": [ +// MACRO_FUN-NEXT: { +// MACRO_FUN-NEXT: "kind": "identifier", +// MACRO_FUN-NEXT: "spelling": "MACRO_FUN" +// MACRO_FUN-NEXT: } +// MACRO_FUN-NEXT: ], +// MACRO_FUN-NEXT: "subHeading": [ +// MACRO_FUN-NEXT: { +// MACRO_FUN-NEXT: "kind": "identifier", +// MACRO_FUN-NEXT: "spelling": "MACRO_FUN" +// MACRO_FUN-NEXT: } +// MACRO_FUN-NEXT: ], +// MACRO_FUN-NEXT: "title": "MACRO_FUN" +// MACRO_FUN-NEXT: }, +// MACRO_FUN-NEXT: "pathComponents": [ +// MACRO_FUN-NEXT: "MACRO_FUN" +// MACRO_FUN-NEXT: ] + +// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix FUN #define FUN(x, y, z) x + y + z +// FUN-LABEL: "!testLabel": "c:@macro@FUN" +// FUN-NEXT: "accessLevel": "public", +// FUN-NEXT: "declarationFragments": [ +// FUN-NEXT: { +// FUN-NEXT: "kind": "keyword", +// FUN-NEXT: "spelling": "#define" +// FUN-NEXT: }, +// FUN-NEXT: { +// FUN-NEXT: "kind": "text", +// FUN-NEXT: "spelling": " " +// FUN-NEXT: }, +// FUN-NEXT: { +// FUN-NEXT: "kind": "identifier", +// FUN-NEXT: "spelling": "FUN" +// FUN-NEXT: }, +// FUN-NEXT: { +// FUN-NEXT: "kind": "text", +// FUN-NEXT: "spelling": "(" +// FUN-NEXT: }, +// FUN-NEXT: { +// FUN-NEXT: "kind": "internalParam", +// FUN-NEXT: "spelling": "x" +// FUN-NEXT: }, +// FUN-NEXT: { +// FUN-NEXT: "kind": "text", +// FUN-NEXT: "spelling": ", " +// FUN-NEXT: }, +// FUN-NEXT: { +// FUN-NEXT: "kind": "internalParam", +// FUN-NEXT: "spelling": "y" +// FUN-NEXT: }, +// FUN-NEXT: { +// FUN-NEXT: "kind": "text", +// FUN-NEXT: "spelling": ", " +// FUN-NEXT: }, +// FUN-NEXT: { +// FUN-NEXT: "kind": "internalParam", +// FUN-NEXT: "spelling": "z" +// FUN-NEXT: }, +// FUN-NEXT: { +// FUN-NEXT: "kind": "text", +// FUN-NEXT: "spelling": ")" +// FUN-NEXT: } +// FUN-NEXT: ], +// FUN: "kind": { +// FUN-NEXT: "displayName": "Macro", +// FUN-NEXT: "identifier": "objective-c.macro" +// FUN-NEXT: }, +// FUN-NEXT: "location": { +// FUN-NEXT: "position": { +// FUN-NEXT: "character": 8, +// FUN-NEXT: "line": [[# @LINE - 53]] +// FUN-NEXT: }, +// FUN-NEXT: "uri": "file://{{.*}}/macros.c" +// FUN-NEXT: }, +// FUN-NEXT: "names": { +// FUN-NEXT: "navigator": [ +// FUN-NEXT: { +// FUN-NEXT: "kind": "identifier", +// FUN-NEXT: "spelling": "FUN" +// FUN-NEXT: } +// FUN-NEXT: ], +// FUN-NEXT: "subHeading": [ +// FUN-NEXT: { +// FUN-NEXT: "kind": "identifier", +// FUN-NEXT: "spelling": "FUN" +// FUN-NEXT: } +// FUN-NEXT: ], +// FUN-NEXT: "title": "FUN" +// FUN-NEXT: }, +// FUN-NEXT: "pathComponents": [ +// FUN-NEXT: "FUN" +// FUN-NEXT: ] + 
+// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix FUNC99 #define FUNC99(x, ...) +// FUNC99-LABEL: "!testLabel": "c:@macro@FUNC99" +// FUNC99-NEXT: "accessLevel": "public", +// FUNC99-NEXT: "declarationFragments": [ +// FUNC99-NEXT: { +// FUNC99-NEXT: "kind": "keyword", +// FUNC99-NEXT: "spelling": "#define" +// FUNC99-NEXT: }, +// FUNC99-NEXT: { +// FUNC99-NEXT: "kind": "text", +// FUNC99-NEXT: "spelling": " " +// FUNC99-NEXT: }, +// FUNC99-NEXT: { +// FUNC99-NEXT: "kind": "identifier", +// FUNC99-NEXT: "spelling": "FUNC99" +// FUNC99-NEXT: }, +// FUNC99-NEXT: { +// FUNC99-NEXT: "kind": "text", +// FUNC99-NEXT: "spelling": "(" +// FUNC99-NEXT: }, +// FUNC99-NEXT: { +// FUNC99-NEXT: "kind": "internalParam", +// FUNC99-NEXT: "spelling": "x" +// FUNC99-NEXT: }, +// FUNC99-NEXT: { +// FUNC99-NEXT: "kind": "text", +// FUNC99-NEXT: "spelling": ", ...)" +// FUNC99-NEXT: } +// FUNC99-NEXT: ], +// FUNC99: "kind": { +// FUNC99-NEXT: "displayName": "Macro", +// FUNC99-NEXT: "identifier": "objective-c.macro" +// FUNC99-NEXT: }, +// FUNC99-NEXT: "location": { +// FUNC99-NEXT: "position": { +// FUNC99-NEXT: "character": 8, +// FUNC99-NEXT: "line": [[# @LINE - 37]] +// FUNC99-NEXT: }, +// FUNC99-NEXT: "uri": "file://{{.*}}/macros.c" +// FUNC99-NEXT: }, +// FUNC99-NEXT: "names": { +// FUNC99-NEXT: "navigator": [ +// FUNC99-NEXT: { +// FUNC99-NEXT: "kind": "identifier", +// FUNC99-NEXT: "spelling": "FUNC99" +// FUNC99-NEXT: } +// FUNC99-NEXT: ], +// FUNC99-NEXT: "subHeading": [ +// FUNC99-NEXT: { +// FUNC99-NEXT: "kind": "identifier", +// FUNC99-NEXT: "spelling": "FUNC99" +// FUNC99-NEXT: } +// FUNC99-NEXT: ], +// FUNC99-NEXT: "title": "FUNC99" +// FUNC99-NEXT: }, +// FUNC99-NEXT: "pathComponents": [ +// FUNC99-NEXT: "FUNC99" +// FUNC99-NEXT: ] + +// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix FUNGNU #define FUNGNU(x...) 
+// FUNGNU-LABEL: "!testLabel": "c:@macro@FUNGNU" +// FUNGNU-NEXT: "accessLevel": "public", +// FUNGNU-NEXT: "declarationFragments": [ +// FUNGNU-NEXT: { +// FUNGNU-NEXT: "kind": "keyword", +// FUNGNU-NEXT: "spelling": "#define" +// FUNGNU-NEXT: }, +// FUNGNU-NEXT: { +// FUNGNU-NEXT: "kind": "text", +// FUNGNU-NEXT: "spelling": " " +// FUNGNU-NEXT: }, +// FUNGNU-NEXT: { +// FUNGNU-NEXT: "kind": "identifier", +// FUNGNU-NEXT: "spelling": "FUNGNU" +// FUNGNU-NEXT: }, +// FUNGNU-NEXT: { +// FUNGNU-NEXT: "kind": "text", +// FUNGNU-NEXT: "spelling": "(" +// FUNGNU-NEXT: }, +// FUNGNU-NEXT: { +// FUNGNU-NEXT: "kind": "internalParam", +// FUNGNU-NEXT: "spelling": "x" +// FUNGNU-NEXT: }, +// FUNGNU-NEXT: { +// FUNGNU-NEXT: "kind": "text", +// FUNGNU-NEXT: "spelling": "...)" +// FUNGNU-NEXT: } +// FUNGNU-NEXT: ], +// FUNGNU: "kind": { +// FUNGNU-NEXT: "displayName": "Macro", +// FUNGNU-NEXT: "identifier": "objective-c.macro" +// FUNGNU-NEXT: }, +// FUNGNU-NEXT: "location": { +// FUNGNU-NEXT: "position": { +// FUNGNU-NEXT: "character": 8, +// FUNGNU-NEXT: "line": [[# @LINE - 37]] +// FUNGNU-NEXT: }, +// FUNGNU-NEXT: "uri": "file://{{.*}}/macros.c" +// FUNGNU-NEXT: }, +// FUNGNU-NEXT: "names": { +// FUNGNU-NEXT: "navigator": [ +// FUNGNU-NEXT: { +// FUNGNU-NEXT: "kind": "identifier", +// FUNGNU-NEXT: "spelling": "FUNGNU" +// FUNGNU-NEXT: } +// FUNGNU-NEXT: ], +// FUNGNU-NEXT: "subHeading": [ +// FUNGNU-NEXT: { +// FUNGNU-NEXT: "kind": "identifier", +// FUNGNU-NEXT: "spelling": "FUNGNU" +// FUNGNU-NEXT: } +// FUNGNU-NEXT: ], +// FUNGNU-NEXT: "title": "FUNGNU" +// FUNGNU-NEXT: }, +// FUNGNU-NEXT: "pathComponents": [ +// FUNGNU-NEXT: "FUNGNU" +// FUNGNU-NEXT: ] + +// expected-no-diagnostics -//--- reference.output.json.in -{ - "metadata": { - "formatVersion": { - "major": 0, - "minor": 5, - "patch": 3 - }, - "generator": "?" 
- }, - "module": { - "name": "Macros", - "platform": { - "architecture": "arm64", - "operatingSystem": { - "minimumVersion": { - "major": 11, - "minor": 0, - "patch": 0 - }, - "name": "macosx" - }, - "vendor": "apple" - } - }, - "relationships": [], - "symbols": [ - { - "accessLevel": "public", - "declarationFragments": [ - { - "kind": "keyword", - "spelling": "#define" - }, - { - "kind": "text", - "spelling": " " - }, - { - "kind": "identifier", - "spelling": "HELLO" - } - ], - "identifier": { - "interfaceLanguage": "objective-c", - "precise": "c:input.h@8@macro@HELLO" - }, - "kind": { - "displayName": "Macro", - "identifier": "objective-c.macro" - }, - "location": { - "position": { - "character": 8, - "line": 0 - }, - "uri": "file://INPUT_DIR/input.h" - }, - "names": { - "navigator": [ - { - "kind": "identifier", - "spelling": "HELLO" - } - ], - "subHeading": [ - { - "kind": "identifier", - "spelling": "HELLO" - } - ], - "title": "HELLO" - }, - "pathComponents": [ - "HELLO" - ] - }, - { - "accessLevel": "public", - "declarationFragments": [ - { - "kind": "keyword", - "spelling": "#define" - }, - { - "kind": "text", - "spelling": " " - }, - { - "kind": "identifier", - "spelling": "WORLD" - } - ], - "identifier": { - "interfaceLanguage": "objective-c", - "precise": "c:input.h@24@macro@WORLD" - }, - "kind": { - "displayName": "Macro", - "identifier": "objective-c.macro" - }, - "location": { - "position": { - "character": 8, - "line": 1 - }, - "uri": "file://INPUT_DIR/input.h" - }, - "names": { - "navigator": [ - { - "kind": "identifier", - "spelling": "WORLD" - } - ], - "subHeading": [ - { - "kind": "identifier", - "spelling": "WORLD" - } - ], - "title": "WORLD" - }, - "pathComponents": [ - "WORLD" - ] - }, - { - "accessLevel": "public", - "declarationFragments": [ - { - "kind": "keyword", - "spelling": "#define" - }, - { - "kind": "text", - "spelling": " " - }, - { - "kind": "identifier", - "spelling": "MACRO_FUN" - }, - { - "kind": "text", - "spelling": "(" - }, - { - "kind": "internalParam", - "spelling": "x" - }, - { - "kind": "text", - "spelling": ")" - } - ], - "identifier": { - "interfaceLanguage": "objective-c", - "precise": "c:input.h@40@macro@MACRO_FUN" - }, - "kind": { - "displayName": "Macro", - "identifier": "objective-c.macro" - }, - "location": { - "position": { - "character": 8, - "line": 2 - }, - "uri": "file://INPUT_DIR/input.h" - }, - "names": { - "navigator": [ - { - "kind": "identifier", - "spelling": "MACRO_FUN" - } - ], - "subHeading": [ - { - "kind": "identifier", - "spelling": "MACRO_FUN" - } - ], - "title": "MACRO_FUN" - }, - "pathComponents": [ - "MACRO_FUN" - ] - }, - { - "accessLevel": "public", - "declarationFragments": [ - { - "kind": "keyword", - "spelling": "#define" - }, - { - "kind": "text", - "spelling": " " - }, - { - "kind": "identifier", - "spelling": "FUN" - }, - { - "kind": "text", - "spelling": "(" - }, - { - "kind": "internalParam", - "spelling": "x" - }, - { - "kind": "text", - "spelling": ", " - }, - { - "kind": "internalParam", - "spelling": "y" - }, - { - "kind": "text", - "spelling": ", " - }, - { - "kind": "internalParam", - "spelling": "z" - }, - { - "kind": "text", - "spelling": ")" - } - ], - "identifier": { - "interfaceLanguage": "objective-c", - "precise": "c:input.h@65@macro@FUN" - }, - "kind": { - "displayName": "Macro", - "identifier": "objective-c.macro" - }, - "location": { - "position": { - "character": 8, - "line": 3 - }, - "uri": "file://INPUT_DIR/input.h" - }, - "names": { - "navigator": [ - { - "kind": "identifier", - 
"spelling": "FUN" - } - ], - "subHeading": [ - { - "kind": "identifier", - "spelling": "FUN" - } - ], - "title": "FUN" - }, - "pathComponents": [ - "FUN" - ] - }, - { - "accessLevel": "public", - "declarationFragments": [ - { - "kind": "keyword", - "spelling": "#define" - }, - { - "kind": "text", - "spelling": " " - }, - { - "kind": "identifier", - "spelling": "FUNC99" - }, - { - "kind": "text", - "spelling": "(" - }, - { - "kind": "internalParam", - "spelling": "x" - }, - { - "kind": "text", - "spelling": ", ...)" - } - ], - "identifier": { - "interfaceLanguage": "objective-c", - "precise": "c:input.h@96@macro@FUNC99" - }, - "kind": { - "displayName": "Macro", - "identifier": "objective-c.macro" - }, - "location": { - "position": { - "character": 8, - "line": 4 - }, - "uri": "file://INPUT_DIR/input.h" - }, - "names": { - "navigator": [ - { - "kind": "identifier", - "spelling": "FUNC99" - } - ], - "subHeading": [ - { - "kind": "identifier", - "spelling": "FUNC99" - } - ], - "title": "FUNC99" - }, - "pathComponents": [ - "FUNC99" - ] - }, - { - "accessLevel": "public", - "declarationFragments": [ - { - "kind": "keyword", - "spelling": "#define" - }, - { - "kind": "text", - "spelling": " " - }, - { - "kind": "identifier", - "spelling": "FUNGNU" - }, - { - "kind": "text", - "spelling": "(" - }, - { - "kind": "internalParam", - "spelling": "x" - }, - { - "kind": "text", - "spelling": "...)" - } - ], - "identifier": { - "interfaceLanguage": "objective-c", - "precise": "c:input.h@119@macro@FUNGNU" - }, - "kind": { - "displayName": "Macro", - "identifier": "objective-c.macro" - }, - "location": { - "position": { - "character": 8, - "line": 5 - }, - "uri": "file://INPUT_DIR/input.h" - }, - "names": { - "navigator": [ - { - "kind": "identifier", - "spelling": "FUNGNU" - } - ], - "subHeading": [ - { - "kind": "identifier", - "spelling": "FUNGNU" - } - ], - "title": "FUNGNU" - }, - "pathComponents": [ - "FUNGNU" - ] - } - ] -} diff --git a/clang/test/ExtractAPI/submodule-macro.m b/clang/test/ExtractAPI/submodule-macro.m new file mode 100644 index 0000000000000..a41d59a157425 --- /dev/null +++ b/clang/test/ExtractAPI/submodule-macro.m @@ -0,0 +1,25 @@ +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing \ +// RUN: --emit-extension-symbol-graphs --symbol-graph-dir=%t/symbols -isystem %t \ +// RUN: --product-name=Umbrella -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/modules-cache \ +// RUN: -triple arm64-apple-macosx -x objective-c-header %t/Umbrella.h %t/Subheader.h + +//--- Umbrella.h +#include "Subheader.h" +#import + +//--- Subheader.h +#define FOO 1 + +//--- module.modulemap +module Umbrella { + umbrella header "Umbrella.h" + export * + module * { export * } +} + +// RUN: FileCheck %s --input-file %t/symbols/Umbrella.symbols.json --check-prefix MOD +// MOD-NOT: bool +// MOD: "!testLabel": "c:@macro@FOO" + diff --git a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip index 6ee10976f1207..bc0c3796ca0f6 100644 --- a/clang/test/Headers/__clang_hip_math.hip +++ b/clang/test/Headers/__clang_hip_math.hip @@ -47,7 +47,7 @@ typedef unsigned long long uint64_t; // CHECK-NEXT: [[CONV5_I:%.*]] = zext nneg i8 [[TMP0]] to i64 // CHECK-NEXT: [[ADD_I:%.*]] = add i64 [[MUL_I]], -48 // CHECK-NEXT: [[SUB_I:%.*]] = add i64 [[ADD_I]], [[CONV5_I]] -// CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr 
inbounds nuw i8, ptr [[__TAGP_ADDR_0_I]], i64 1 // CHECK-NEXT: br label [[CLEANUP_I]] // CHECK: cleanup.i: // CHECK-NEXT: [[__TAGP_ADDR_1_I]] = phi ptr [ [[INCDEC_PTR_I]], [[IF_THEN_I]] ], [ [[__TAGP_ADDR_0_I]], [[WHILE_BODY_I]] ] @@ -79,7 +79,7 @@ extern "C" __device__ uint64_t test___make_mantissa_base8(const char *p) { // CHECK-NEXT: [[CONV5_I:%.*]] = zext nneg i8 [[TMP0]] to i64 // CHECK-NEXT: [[ADD_I:%.*]] = add i64 [[MUL_I]], -48 // CHECK-NEXT: [[SUB_I:%.*]] = add i64 [[ADD_I]], [[CONV5_I]] -// CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I]], i64 1 // CHECK-NEXT: br label [[CLEANUP_I]] // CHECK: cleanup.i: // CHECK-NEXT: [[__TAGP_ADDR_1_I]] = phi ptr [ [[INCDEC_PTR_I]], [[IF_THEN_I]] ], [ [[__TAGP_ADDR_0_I]], [[WHILE_BODY_I]] ] @@ -120,7 +120,7 @@ extern "C" __device__ uint64_t test___make_mantissa_base10(const char *p) { // CHECK-NEXT: [[CONV25_I:%.*]] = zext nneg i8 [[TMP0]] to i64 // CHECK-NEXT: [[ADD26_I:%.*]] = add i64 [[MUL24_I]], [[DOTSINK]] // CHECK-NEXT: [[ADD28_I:%.*]] = add i64 [[ADD26_I]], [[CONV25_I]] -// CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I]], i64 1 // CHECK-NEXT: br label [[CLEANUP_I]] // CHECK: cleanup.i: // CHECK-NEXT: [[__TAGP_ADDR_1_I]] = phi ptr [ [[INCDEC_PTR_I]], [[IF_END31_I]] ], [ [[__TAGP_ADDR_0_I]], [[IF_ELSE17_I]] ] @@ -141,7 +141,7 @@ extern "C" __device__ uint64_t test___make_mantissa_base16(const char *p) { // CHECK-NEXT: [[CMP_I:%.*]] = icmp eq i8 [[TMP0]], 48 // CHECK-NEXT: br i1 [[CMP_I]], label [[IF_THEN_I:%.*]], label [[WHILE_COND_I14_I:%.*]] // CHECK: if.then.i: -// CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 1 // CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I]], align 1, !tbaa [[TBAA4]] // CHECK-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I:%.*]] [ // CHECK-NEXT: i8 120, label [[WHILE_COND_I30_I_PREHEADER:%.*]] @@ -173,7 +173,7 @@ extern "C" __device__ uint64_t test___make_mantissa_base16(const char *p) { // CHECK-NEXT: [[CONV25_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 // CHECK-NEXT: [[ADD26_I_I:%.*]] = add i64 [[MUL24_I_I]], [[DOTSINK]] // CHECK-NEXT: [[ADD28_I_I:%.*]] = add i64 [[ADD26_I_I]], [[CONV25_I_I]] -// CHECK-NEXT: [[INCDEC_PTR_I40_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I31_I]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I40_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I31_I]], i64 1 // CHECK-NEXT: br label [[CLEANUP_I36_I]] // CHECK: cleanup.i36.i: // CHECK-NEXT: [[__TAGP_ADDR_1_I37_I]] = phi ptr [ [[INCDEC_PTR_I40_I]], [[IF_END31_I_I]] ], [ [[__TAGP_ADDR_0_I31_I]], [[IF_ELSE17_I_I]] ] @@ -195,7 +195,7 @@ extern "C" __device__ uint64_t test___make_mantissa_base16(const char *p) { // CHECK-NEXT: [[CONV5_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 // CHECK-NEXT: [[ADD_I_I:%.*]] = add i64 [[MUL_I_I]], -48 // CHECK-NEXT: [[SUB_I_I:%.*]] = add i64 [[ADD_I_I]], [[CONV5_I_I]] -// CHECK-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I_I]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I_I]], i64 1 // CHECK-NEXT: br label [[CLEANUP_I_I]] // CHECK: cleanup.i.i: // CHECK-NEXT: [[__TAGP_ADDR_1_I_I]] = phi ptr [ [[INCDEC_PTR_I_I]], 
[[IF_THEN_I_I]] ], [ [[__TAGP_ADDR_0_I_I]], [[WHILE_BODY_I_I]] ] @@ -216,7 +216,7 @@ extern "C" __device__ uint64_t test___make_mantissa_base16(const char *p) { // CHECK-NEXT: [[CONV5_I26_I:%.*]] = zext nneg i8 [[TMP8]] to i64 // CHECK-NEXT: [[ADD_I27_I:%.*]] = add i64 [[MUL_I25_I]], -48 // CHECK-NEXT: [[SUB_I28_I:%.*]] = add i64 [[ADD_I27_I]], [[CONV5_I26_I]] -// CHECK-NEXT: [[INCDEC_PTR_I29_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I15_I]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I29_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I15_I]], i64 1 // CHECK-NEXT: br label [[CLEANUP_I20_I]] // CHECK: cleanup.i20.i: // CHECK-NEXT: [[__TAGP_ADDR_1_I21_I]] = phi ptr [ [[INCDEC_PTR_I29_I]], [[IF_THEN_I24_I]] ], [ [[__TAGP_ADDR_0_I15_I]], [[WHILE_BODY_I18_I]] ] @@ -2367,7 +2367,7 @@ extern "C" __device__ double test_modf(double x, double* y) { // CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 // CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I:%.*]] // CHECK: if.then.i.i: -// CHECK-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[TAG]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TAG]], i64 1 // CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] // CHECK-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I_I:%.*]] [ // CHECK-NEXT: i8 120, label [[WHILE_COND_I30_I_I_PREHEADER:%.*]] @@ -2399,7 +2399,7 @@ extern "C" __device__ double test_modf(double x, double* y) { // CHECK-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 // CHECK-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] // CHECK-NEXT: [[ADD28_I_I_I:%.*]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] -// CHECK-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 // CHECK-NEXT: br label [[CLEANUP_I36_I_I]] // CHECK: cleanup.i36.i.i: // CHECK-NEXT: [[__TAGP_ADDR_1_I37_I_I]] = phi ptr [ [[INCDEC_PTR_I40_I_I]], [[IF_END31_I_I_I]] ], [ [[__TAGP_ADDR_0_I31_I_I]], [[IF_ELSE17_I_I_I]] ] @@ -2421,7 +2421,7 @@ extern "C" __device__ double test_modf(double x, double* y) { // CHECK-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 // CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 // CHECK-NEXT: [[SUB_I_I_I:%.*]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] -// CHECK-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 // CHECK-NEXT: br label [[CLEANUP_I_I_I]] // CHECK: cleanup.i.i.i: // CHECK-NEXT: [[__TAGP_ADDR_1_I_I_I]] = phi ptr [ [[INCDEC_PTR_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__TAGP_ADDR_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] @@ -2442,7 +2442,7 @@ extern "C" __device__ double test_modf(double x, double* y) { // CHECK-NEXT: [[CONV5_I26_I_I:%.*]] = zext nneg i8 [[TMP8]] to i64 // CHECK-NEXT: [[ADD_I27_I_I:%.*]] = add i64 [[MUL_I25_I_I]], -48 // CHECK-NEXT: [[SUB_I28_I_I:%.*]] = add i64 [[ADD_I27_I_I]], [[CONV5_I26_I_I]] -// CHECK-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 // CHECK-NEXT: br label [[CLEANUP_I20_I_I]] // CHECK: cleanup.i20.i.i: // CHECK-NEXT: [[__TAGP_ADDR_1_I21_I_I]] = phi ptr [ 
[[INCDEC_PTR_I29_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__TAGP_ADDR_0_I15_I_I]], [[WHILE_BODY_I18_I_I]] ] @@ -2466,7 +2466,7 @@ extern "C" __device__ float test_nanf(const char *tag) { // CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 // CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I:%.*]] // CHECK: if.then.i.i: -// CHECK-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[TAG]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TAG]], i64 1 // CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] // CHECK-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I_I:%.*]] [ // CHECK-NEXT: i8 120, label [[WHILE_COND_I30_I_I_PREHEADER:%.*]] @@ -2498,7 +2498,7 @@ extern "C" __device__ float test_nanf(const char *tag) { // CHECK-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 // CHECK-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] // CHECK-NEXT: [[ADD28_I_I_I:%.*]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] -// CHECK-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 // CHECK-NEXT: br label [[CLEANUP_I36_I_I]] // CHECK: cleanup.i36.i.i: // CHECK-NEXT: [[__TAGP_ADDR_1_I37_I_I]] = phi ptr [ [[INCDEC_PTR_I40_I_I]], [[IF_END31_I_I_I]] ], [ [[__TAGP_ADDR_0_I31_I_I]], [[IF_ELSE17_I_I_I]] ] @@ -2520,7 +2520,7 @@ extern "C" __device__ float test_nanf(const char *tag) { // CHECK-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 // CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 // CHECK-NEXT: [[SUB_I_I_I:%.*]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] -// CHECK-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 // CHECK-NEXT: br label [[CLEANUP_I_I_I]] // CHECK: cleanup.i.i.i: // CHECK-NEXT: [[__TAGP_ADDR_1_I_I_I]] = phi ptr [ [[INCDEC_PTR_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__TAGP_ADDR_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] @@ -2541,7 +2541,7 @@ extern "C" __device__ float test_nanf(const char *tag) { // CHECK-NEXT: [[CONV5_I26_I_I:%.*]] = zext nneg i8 [[TMP8]] to i64 // CHECK-NEXT: [[ADD_I27_I_I:%.*]] = add i64 [[MUL_I25_I_I]], -48 // CHECK-NEXT: [[SUB_I28_I_I:%.*]] = add i64 [[ADD_I27_I_I]], [[CONV5_I26_I_I]] -// CHECK-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 +// CHECK-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 // CHECK-NEXT: br label [[CLEANUP_I20_I_I]] // CHECK: cleanup.i20.i.i: // CHECK-NEXT: [[__TAGP_ADDR_1_I21_I_I]] = phi ptr [ [[INCDEC_PTR_I29_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__TAGP_ADDR_0_I15_I_I]], [[WHILE_BODY_I18_I_I]] ] @@ -2862,7 +2862,7 @@ extern "C" __device__ double test_normcdfinv(double x) { // DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] // DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract float [[TMP0]], [[TMP0]] // DEFAULT-NEXT: [[ADD_I]] = fadd contract float [[__R_0_I4]], [[MUL_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 4 +// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // DEFAULT-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], 
label [[_ZL5NORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP20:![0-9]+]] // DEFAULT: _ZL5normfiPKf.exit: @@ -2882,7 +2882,7 @@ extern "C" __device__ double test_normcdfinv(double x) { // FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] // FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract float [[TMP0]], [[TMP0]] // FINITEONLY-NEXT: [[ADD_I]] = fadd nnan ninf contract float [[__R_0_I4]], [[MUL_I]] -// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 4 +// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // FINITEONLY-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5NORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP20:![0-9]+]] // FINITEONLY: _ZL5normfiPKf.exit: @@ -2902,7 +2902,7 @@ extern "C" __device__ double test_normcdfinv(double x) { // APPROX-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] // APPROX-NEXT: [[MUL_I:%.*]] = fmul contract float [[TMP0]], [[TMP0]] // APPROX-NEXT: [[ADD_I]] = fadd contract float [[__R_0_I4]], [[MUL_I]] -// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 4 +// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // APPROX-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5NORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP20:![0-9]+]] // APPROX: _ZL5normfiPKf.exit: @@ -2926,7 +2926,7 @@ extern "C" __device__ float test_normf(int x, const float *y) { // DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] // DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], [[TMP0]] // DEFAULT-NEXT: [[ADD_I]] = fadd contract double [[__R_0_I4]], [[MUL_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 8 +// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 8 // DEFAULT-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL4NORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]] // DEFAULT: _ZL4normiPKd.exit: @@ -2946,7 +2946,7 @@ extern "C" __device__ float test_normf(int x, const float *y) { // FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] // FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract double [[TMP0]], [[TMP0]] // FINITEONLY-NEXT: [[ADD_I]] = fadd nnan ninf contract double [[__R_0_I4]], [[MUL_I]] -// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 8 +// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 8 // FINITEONLY-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL4NORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]] // FINITEONLY: _ZL4normiPKd.exit: @@ -2966,7 +2966,7 @@ extern "C" __device__ float test_normf(int x, const float *y) { // APPROX-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] // APPROX-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], [[TMP0]] // APPROX-NEXT: [[ADD_I]] = fadd contract double [[__R_0_I4]], [[MUL_I]] -// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 8 +// 
APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 8 // APPROX-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL4NORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]] // APPROX: _ZL4normiPKd.exit: @@ -3286,7 +3286,7 @@ extern "C" __device__ double test_rint(double x) { // DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] // DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract float [[TMP0]], [[TMP0]] // DEFAULT-NEXT: [[ADD_I]] = fadd contract float [[__R_0_I4]], [[MUL_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 4 +// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // DEFAULT-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] // DEFAULT: _ZL6rnormfiPKf.exit: @@ -3306,7 +3306,7 @@ extern "C" __device__ double test_rint(double x) { // FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] // FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract float [[TMP0]], [[TMP0]] // FINITEONLY-NEXT: [[ADD_I]] = fadd nnan ninf contract float [[__R_0_I4]], [[MUL_I]] -// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 4 +// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // FINITEONLY-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] // FINITEONLY: _ZL6rnormfiPKf.exit: @@ -3326,7 +3326,7 @@ extern "C" __device__ double test_rint(double x) { // APPROX-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] // APPROX-NEXT: [[MUL_I:%.*]] = fmul contract float [[TMP0]], [[TMP0]] // APPROX-NEXT: [[ADD_I]] = fadd contract float [[__R_0_I4]], [[MUL_I]] -// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 4 +// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 4 // APPROX-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] // APPROX: _ZL6rnormfiPKf.exit: @@ -3350,7 +3350,7 @@ extern "C" __device__ float test_rnormf(int x, const float* y) { // DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] // DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], [[TMP0]] // DEFAULT-NEXT: [[ADD_I]] = fadd contract double [[__R_0_I4]], [[MUL_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 8 +// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 8 // DEFAULT-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] // DEFAULT: _ZL5rnormiPKd.exit: @@ -3370,7 +3370,7 @@ extern "C" __device__ float test_rnormf(int x, const float* y) { // FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] // FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract double [[TMP0]], [[TMP0]] // FINITEONLY-NEXT: 
[[ADD_I]] = fadd nnan ninf contract double [[__R_0_I4]], [[MUL_I]] -// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 8 +// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 8 // FINITEONLY-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] // FINITEONLY: _ZL5rnormiPKd.exit: @@ -3390,7 +3390,7 @@ extern "C" __device__ float test_rnormf(int x, const float* y) { // APPROX-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] // APPROX-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], [[TMP0]] // APPROX-NEXT: [[ADD_I]] = fadd contract double [[__R_0_I4]], [[MUL_I]] -// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 8 +// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__A_ADDR_0_I3]], i64 8 // APPROX-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] // APPROX: _ZL5rnormiPKd.exit: diff --git a/clang/test/OpenMP/bug60602.cpp b/clang/test/OpenMP/bug60602.cpp index cb2e4e5b11e33..0789ef958e523 100644 --- a/clang/test/OpenMP/bug60602.cpp +++ b/clang/test/OpenMP/bug60602.cpp @@ -58,13 +58,13 @@ int kernel_within_loop(int *a, int *b, int N, int num_iters) { // CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[A_ADDR]], align 8 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 0 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP7]], i64 0 // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[N_ADDR]], align 4 // CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP8]] to i64 // CHECK-NEXT: [[TMP9:%.*]] = mul nuw i64 [[CONV]], 4 // CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[B_ADDR]], align 8 -// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 0 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP11]], i64 0 // CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[N_ADDR]], align 4 // CHECK-NEXT: [[CONV2:%.*]] = sext i32 [[TMP12]] to i64 // CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[CONV2]], 4 @@ -134,13 +134,13 @@ int kernel_within_loop(int *a, int *b, int N, int num_iters) { // CHECK-NEXT: [[TMP46:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP47:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP48:%.*]] = load ptr, ptr [[A_ADDR]], align 8 -// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP48]], i64 0 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP48]], i64 0 // CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr [[N_ADDR]], align 4 // CHECK-NEXT: [[CONV5:%.*]] = sext i32 [[TMP49]] to i64 // CHECK-NEXT: [[TMP50:%.*]] = mul nuw i64 [[CONV5]], 4 // CHECK-NEXT: [[TMP51:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP52:%.*]] = load ptr, ptr [[B_ADDR]], align 8 -// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i64 0 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP52]], i64 0 // CHECK-NEXT: [[TMP53:%.*]] = load i32, ptr [[N_ADDR]], align 4 // CHECK-NEXT: [[CONV7:%.*]] = 
sext i32 [[TMP53]] to i64 // CHECK-NEXT: [[TMP54:%.*]] = mul nuw i64 [[CONV7]], 4 diff --git a/clang/test/OpenMP/declare_mapper_codegen.cpp b/clang/test/OpenMP/declare_mapper_codegen.cpp index 52d5ceffa1471..d2954b7a74821 100644 --- a/clang/test/OpenMP/declare_mapper_codegen.cpp +++ b/clang/test/OpenMP/declare_mapper_codegen.cpp @@ -129,7 +129,7 @@ class C { // CK0-DAG: [[BBEGIN:%.+]] = getelementptr inbounds nuw %class.C, ptr [[PTR]], i32 0, i32 1 // CK0-DAG: [[BBEGIN2:%.+]] = getelementptr inbounds nuw %class.C, ptr [[PTR]], i32 0, i32 1 // CK0-DAG: [[BARRBEGIN:%.+]] = load ptr, ptr [[BBEGIN2]] -// CK0-DAG: [[BARRBEGINGEP:%.+]] = getelementptr inbounds double, ptr [[BARRBEGIN]], i[[sz:64|32]] 0 +// CK0-DAG: [[BARRBEGINGEP:%.+]] = getelementptr inbounds nuw double, ptr [[BARRBEGIN]], i[[sz:64|32]] 0 // CK0-DAG: [[BEND:%.+]] = getelementptr ptr, ptr [[BBEGIN]], i32 1 // CK0-DAG: [[ABEGINI:%.+]] = ptrtoint ptr [[ABEGIN]] to i64 // CK0-DAG: [[BENDI:%.+]] = ptrtoint ptr [[BEND]] to i64 @@ -965,7 +965,7 @@ class C { // CK4-DAG: [[BBEGIN:%.+]] = getelementptr inbounds nuw %class.C, ptr [[PTR]], i32 0, i32 1 // CK4-DAG: [[BBEGIN2:%.+]] = getelementptr inbounds nuw %class.C, ptr [[PTR]], i32 0, i32 1 // CK4-DAG: [[BARRBEGIN:%.+]] = load ptr, ptr [[BBEGIN2]] -// CK4-DAG: [[BARRBEGINGEP:%.+]] = getelementptr inbounds double, ptr [[BARRBEGIN]], i[[sz:64|32]] 0 +// CK4-DAG: [[BARRBEGINGEP:%.+]] = getelementptr inbounds nuw double, ptr [[BARRBEGIN]], i[[sz:64|32]] 0 // CK4-DAG: [[BEND:%.+]] = getelementptr ptr, ptr [[BBEGIN]], i32 1 // CK4-DAG: [[ABEGINI:%.+]] = ptrtoint ptr [[ABEGIN]] to i64 // CK4-DAG: [[BENDI:%.+]] = ptrtoint ptr [[BEND]] to i64 diff --git a/clang/test/OpenMP/distribute_codegen.cpp b/clang/test/OpenMP/distribute_codegen.cpp index ea619cb6e0f26..6c588ba25db30 100644 --- a/clang/test/OpenMP/distribute_codegen.cpp +++ b/clang/test/OpenMP/distribute_codegen.cpp @@ -662,24 +662,24 @@ int fint(void) { return ftemplate(); } // CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK1-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK1-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK1-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw 
float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK1-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK1-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK1-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -1574,21 +1574,21 @@ int fint(void) { return ftemplate(); } // CHECK3-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 [[TMP15]] +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i32 [[TMP15]] // CHECK3-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] -// CHECK3-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 [[TMP18]] +// CHECK3-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i32 [[TMP18]] // CHECK3-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: [[MUL4:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK3-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] -// CHECK3-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP21]] +// CHECK3-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i32 [[TMP21]] // CHECK3-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: [[MUL6:%.*]] = fmul float [[MUL4]], [[TMP22]] // CHECK3-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] -// CHECK3-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP24]] +// CHECK3-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i32 [[TMP24]] // CHECK3-NEXT: store float [[MUL6]], ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3: omp.body.continue: @@ -2252,24 +2252,24 @@ int fint(void) { return ftemplate(); } // CHECK17-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK17-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, 
ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK17-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK17-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK17-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] +// CHECK17-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK17-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK17-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK17-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK17-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK17-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK17-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK17-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK17-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK17-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK17-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK17: omp.body.continue: @@ -2790,21 +2790,21 @@ int fint(void) { return ftemplate(); } // CHECK19-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP12]] -// CHECK19-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 [[TMP15]] +// CHECK19-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i32 [[TMP15]] // CHECK19-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP12]] -// CHECK19-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 [[TMP18]] +// CHECK19-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i32 [[TMP18]] // CHECK19-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: [[MUL4:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK19-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP12]] 
-// CHECK19-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP21]] +// CHECK19-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i32 [[TMP21]] // CHECK19-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: [[MUL6:%.*]] = fmul float [[MUL4]], [[TMP22]] // CHECK19-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP12]] -// CHECK19-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP24]] +// CHECK19-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i32 [[TMP24]] // CHECK19-NEXT: store float [[MUL6]], ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP12]] // CHECK19-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK19: omp.body.continue: diff --git a/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp index b019b4ff92ad5..93a6779ac02e8 100644 --- a/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp @@ -175,16 +175,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP3]], i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 0 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP6]] // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP7]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP7]], i64 9 // CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP11:%.*]] = sub i64 [[TMP9]], [[TMP10]] @@ -214,7 +214,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP22]] // CHECK1-NEXT: store ptr [[_TMP6]], ptr [[_TMP5]], align 8 // CHECK1-NEXT: store ptr [[TMP23]], ptr [[_TMP6]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP24]], align 8 // CHECK1-NEXT: 
[[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -229,19 +229,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP30]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 0 // CHECK1-NEXT: [[TMP32:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds ptr, ptr [[TMP32]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP32]], i64 0 // CHECK1-NEXT: [[TMP33:%.*]] = load ptr, ptr [[ARRAYIDX8]], align 8 -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[TMP33]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP33]], i64 0 // CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP35:%.*]] = sext i32 [[TMP34]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN10:%.*]] = add nsw i64 -1, [[TMP35]] // CHECK1-NEXT: [[TMP36:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr [[TMP36]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP36]], i64 9 // CHECK1-NEXT: [[TMP37:%.*]] = load ptr, ptr [[ARRAYIDX11]], align 8 -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, ptr [[TMP37]], i64 [[LB_ADD_LEN10]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP37]], i64 [[LB_ADD_LEN10]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP31]], align 8 // CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX9]], ptr [[TMP38]], align 8 @@ -562,9 +562,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/distribute_simd_codegen.cpp b/clang/test/OpenMP/distribute_simd_codegen.cpp index f7353172e235c..ad93fd6030ac7 100644 --- a/clang/test/OpenMP/distribute_simd_codegen.cpp +++ b/clang/test/OpenMP/distribute_simd_codegen.cpp @@ -706,24 +706,24 @@ int fint(void) { return 
ftemplate(); } // CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK1-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK1-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK1-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK1-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK1-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK1-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP17]] // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -1682,21 +1682,21 @@ int fint(void) { return ftemplate(); } // CHECK3-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 [[TMP15]] +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i32 [[TMP15]] // CHECK3-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] -// CHECK3-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 [[TMP18]] +// CHECK3-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i32 [[TMP18]] // CHECK3-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX3]], 
align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: [[MUL4:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK3-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] -// CHECK3-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP21]] +// CHECK3-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i32 [[TMP21]] // CHECK3-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: [[MUL6:%.*]] = fmul float [[MUL4]], [[TMP22]] // CHECK3-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] -// CHECK3-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP24]] +// CHECK3-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i32 [[TMP24]] // CHECK3-NEXT: store float [[MUL6]], ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3: omp.body.continue: @@ -2664,24 +2664,24 @@ int fint(void) { return ftemplate(); } // CHECK5-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK5-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] +// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK5-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK5-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK5-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK5-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK5-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK5-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] 
// CHECK5-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: @@ -3671,21 +3671,21 @@ int fint(void) { return ftemplate(); } // CHECK7-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] -// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 [[TMP15]] +// CHECK7-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i32 [[TMP15]] // CHECK7-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] -// CHECK7-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 [[TMP18]] +// CHECK7-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i32 [[TMP18]] // CHECK7-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: [[MUL4:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK7-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] -// CHECK7-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP21]] +// CHECK7-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i32 [[TMP21]] // CHECK7-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: [[MUL6:%.*]] = fmul float [[MUL4]], [[TMP22]] // CHECK7-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] -// CHECK7-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP24]] +// CHECK7-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i32 [[TMP24]] // CHECK7-NEXT: store float [[MUL6]], ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK7-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK7: omp.body.continue: @@ -4290,24 +4290,24 @@ int fint(void) { return ftemplate(); } // CHECK9-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP5]] to i64 -// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 [[IDXPROM]] +// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP4]], i64 [[IDXPROM]] // CHECK9-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[IDXPROM1:%.*]] = zext i32 [[TMP8]] to i64 -// CHECK9-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i64 [[IDXPROM1]] +// CHECK9-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i64 
[[IDXPROM1]] // CHECK9-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[MUL3:%.*]] = fmul float [[TMP6]], [[TMP9]] // CHECK9-NEXT: [[TMP10:%.*]] = load ptr, ptr [[D_ADDR]], align 8, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP11]] to i64 -// CHECK9-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[IDXPROM4]] +// CHECK9-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i64 [[IDXPROM4]] // CHECK9-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[MUL6:%.*]] = fmul float [[MUL3]], [[TMP12]] // CHECK9-NEXT: [[TMP13:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK9-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM7]] +// CHECK9-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM7]] // CHECK9-NEXT: store float [[MUL6]], ptr [[ARRAYIDX8]], align 4, !llvm.access.group [[ACC_GRP9]] // CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK9: omp.body.continue: @@ -4606,21 +4606,21 @@ int fint(void) { return ftemplate(); } // CHECK11-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] -// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 [[TMP5]] +// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP4]], i32 [[TMP5]] // CHECK11-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C_ADDR]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: [[TMP8:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] -// CHECK11-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 [[TMP8]] +// CHECK11-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i32 [[TMP8]] // CHECK11-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX1]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: [[MUL2:%.*]] = fmul float [[TMP6]], [[TMP9]] // CHECK11-NEXT: [[TMP10:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] -// CHECK11-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 [[TMP11]] +// CHECK11-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i32 [[TMP11]] // CHECK11-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: [[MUL4:%.*]] = fmul float [[MUL2]], [[TMP12]] // CHECK11-NEXT: [[TMP13:%.*]] = load ptr, ptr [[A_ADDR]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] -// CHECK11-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP14]] +// CHECK11-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds 
nuw float, ptr [[TMP13]], i32 [[TMP14]] // CHECK11-NEXT: store float [[MUL4]], ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK11: omp.body.continue: @@ -4928,24 +4928,24 @@ int fint(void) { return ftemplate(); } // CHECK13-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP5]] to i64 -// CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 [[IDXPROM]] +// CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP4]], i64 [[IDXPROM]] // CHECK13-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[TMP8:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[IDXPROM1:%.*]] = zext i32 [[TMP8]] to i64 -// CHECK13-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i64 [[IDXPROM1]] +// CHECK13-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i64 [[IDXPROM1]] // CHECK13-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[MUL3:%.*]] = fmul float [[TMP6]], [[TMP9]] // CHECK13-NEXT: [[TMP10:%.*]] = load ptr, ptr [[D_ADDR]], align 8, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP11]] to i64 -// CHECK13-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[IDXPROM4]] +// CHECK13-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i64 [[IDXPROM4]] // CHECK13-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[MUL6:%.*]] = fmul float [[MUL3]], [[TMP12]] // CHECK13-NEXT: [[TMP13:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK13-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM7]] +// CHECK13-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM7]] // CHECK13-NEXT: store float [[MUL6]], ptr [[ARRAYIDX8]], align 4, !llvm.access.group [[ACC_GRP10]] // CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK13: omp.body.continue: @@ -5275,21 +5275,21 @@ int fint(void) { return ftemplate(); } // CHECK15-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B_ADDR]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] -// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 [[TMP5]] +// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP4]], i32 [[TMP5]] // CHECK15-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C_ADDR]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: [[TMP8:%.*]] = load i32, ptr [[I]], align 4, 
!llvm.access.group [[ACC_GRP11]] -// CHECK15-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 [[TMP8]] +// CHECK15-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i32 [[TMP8]] // CHECK15-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX1]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: [[MUL2:%.*]] = fmul float [[TMP6]], [[TMP9]] // CHECK15-NEXT: [[TMP10:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] -// CHECK15-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 [[TMP11]] +// CHECK15-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i32 [[TMP11]] // CHECK15-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: [[MUL4:%.*]] = fmul float [[MUL2]], [[TMP12]] // CHECK15-NEXT: [[TMP13:%.*]] = load ptr, ptr [[A_ADDR]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP11]] -// CHECK15-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 [[TMP14]] +// CHECK15-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i32 [[TMP14]] // CHECK15-NEXT: store float [[MUL4]], ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP11]] // CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK15: omp.body.continue: @@ -5782,24 +5782,24 @@ int fint(void) { return ftemplate(); } // CHECK17-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK17-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK17-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK17-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK17-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] +// CHECK17-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK17-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK17-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK17-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK17-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK17-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK17-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[TMP24:%.*]] = 
load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK17-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK17-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK17-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP18]] // CHECK17-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK17: omp.body.continue: @@ -6373,21 +6373,21 @@ int fint(void) { return ftemplate(); } // CHECK19-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] -// CHECK19-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 [[TMP15]] +// CHECK19-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i32 [[TMP15]] // CHECK19-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] -// CHECK19-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 [[TMP18]] +// CHECK19-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i32 [[TMP18]] // CHECK19-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: [[MUL4:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK19-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] -// CHECK19-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP21]] +// CHECK19-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i32 [[TMP21]] // CHECK19-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: [[MUL6:%.*]] = fmul float [[MUL4]], [[TMP22]] // CHECK19-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] -// CHECK19-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP24]] +// CHECK19-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i32 [[TMP24]] // CHECK19-NEXT: store float [[MUL6]], ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK19-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK19: omp.body.continue: @@ -6970,24 +6970,24 @@ int fint(void) { return ftemplate(); } // CHECK21-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK21-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK21-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK21-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8, 
!llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK21-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] +// CHECK21-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK21-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK21-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK21-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK21-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK21-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK21-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK21-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK21-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK21-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP19]] // CHECK21-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK21: omp.body.continue: @@ -7592,21 +7592,21 @@ int fint(void) { return ftemplate(); } // CHECK23-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP20]] // CHECK23-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP20]] // CHECK23-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP20]] -// CHECK23-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 [[TMP15]] +// CHECK23-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i32 [[TMP15]] // CHECK23-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP20]] // CHECK23-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP20]] // CHECK23-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP20]] -// CHECK23-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 [[TMP18]] +// CHECK23-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i32 [[TMP18]] // CHECK23-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP20]] // CHECK23-NEXT: [[MUL4:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK23-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP20]] // CHECK23-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP20]] -// CHECK23-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 [[TMP21]] +// CHECK23-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i32 [[TMP21]] // CHECK23-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX5]], align 4, !llvm.access.group [[ACC_GRP20]] // 
CHECK23-NEXT: [[MUL6:%.*]] = fmul float [[MUL4]], [[TMP22]] // CHECK23-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 4, !llvm.access.group [[ACC_GRP20]] // CHECK23-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP20]] -// CHECK23-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i32 [[TMP24]] +// CHECK23-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i32 [[TMP24]] // CHECK23-NEXT: store float [[MUL6]], ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP20]] // CHECK23-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK23: omp.body.continue: diff --git a/clang/test/OpenMP/for_linear_codegen.cpp b/clang/test/OpenMP/for_linear_codegen.cpp index 395ccdbeed763..5a21fe8509fd3 100644 --- a/clang/test/OpenMP/for_linear_codegen.cpp +++ b/clang/test/OpenMP/for_linear_codegen.cpp @@ -650,7 +650,7 @@ int main() { // CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP16]], [[MUL9]] // CHECK1-NEXT: store i32 [[ADD10]], ptr [[LVAR5]], align 4 // CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[PVAR4]], align 8 -// CHECK1-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 1 +// CHECK1-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP18]], i32 1 // CHECK1-NEXT: store ptr [[INCDEC_PTR]], ptr [[PVAR4]], align 8 // CHECK1-NEXT: [[TMP19:%.*]] = load ptr, ptr [[_TMP6]], align 8 // CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 diff --git a/clang/test/OpenMP/for_reduction_codegen.cpp b/clang/test/OpenMP/for_reduction_codegen.cpp index ea32e98bf1423..83632db238484 100644 --- a/clang/test/OpenMP/for_reduction_codegen.cpp +++ b/clang/test/OpenMP/for_reduction_codegen.cpp @@ -1021,14 +1021,14 @@ int main() { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP5:%.*]] = mul nsw i64 1, [[TMP1]] // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP5]] -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX]], i64 0 // CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [2 x i32], ptr [[TMP3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4 // CHECK1-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP7]] // CHECK1-NEXT: [[TMP8:%.*]] = mul nsw i64 1, [[TMP1]] // CHECK1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP8]] -// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX5]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX5]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64 // CHECK1-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARRAYIDX3]] to i64 // CHECK1-NEXT: [[TMP11:%.*]] = sub i64 [[TMP9]], [[TMP10]] @@ -1054,16 +1054,16 @@ int main() { // CHECK1-NEXT: [[TMP19:%.*]] = sub i64 [[TMP17]], [[TMP18]] // CHECK1-NEXT: [[TMP20:%.*]] = sdiv exact i64 [[TMP19]], ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64) // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[VLA7]], i64 [[TMP20]] -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [4 x %struct.S]], ptr [[TMP4]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw [10 x [4 x %struct.S]], ptr [[TMP4]], i64 0, i64 1 // CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x 
%struct.S], ptr [[ARRAYIDX8]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[ARRAYDECAY]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[ARRAYDECAY]], i64 1 // CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [2 x i32], ptr [[TMP3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4 // CHECK1-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN11:%.*]] = add nsw i64 0, [[TMP23]] -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x [4 x %struct.S]], ptr [[TMP4]], i64 0, i64 [[LB_ADD_LEN11]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds nuw [10 x [4 x %struct.S]], ptr [[TMP4]], i64 0, i64 [[LB_ADD_LEN11]] // CHECK1-NEXT: [[ARRAYDECAY13:%.*]] = getelementptr inbounds [4 x %struct.S], ptr [[ARRAYIDX12]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDECAY13]], i64 2 +// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[ARRAYDECAY13]], i64 2 // CHECK1-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[ARRAYIDX14]] to i64 // CHECK1-NEXT: [[TMP25:%.*]] = ptrtoint ptr [[ARRAYIDX9]] to i64 // CHECK1-NEXT: [[TMP26:%.*]] = sub i64 [[TMP24]], [[TMP25]] @@ -1580,10 +1580,10 @@ int main() { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP3:%.*]] = mul nsw i64 1, [[TMP1]] // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP3]] -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX]], i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = mul nsw i64 1, [[TMP1]] // CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP4]] -// CHECK1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX4]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX4]], i64 1 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [1 x [2 x i32]], ptr [[ARR6]], i32 0, i32 0, i32 0 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 2 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP5]] @@ -1757,13 +1757,13 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[TMP2]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[TMP2]], i64 1 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i64 4 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP3]], i64 4 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[TMP4]], i64 6 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP4]], i64 6 // CHECK1-NEXT: [[TMP5:%.*]] = ptrtoint ptr 
[[ARRAYIDX3]] to i64 // CHECK1-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ARRAYIDX1]] to i64 // CHECK1-NEXT: [[TMP7:%.*]] = sub i64 [[TMP5]], [[TMP6]] @@ -1963,11 +1963,11 @@ int main() { // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 1 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[TMP2]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[TMP2]], i64 1 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i64 1 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[TMP4]], i64 6 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP4]], i64 6 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [1 x [6 x %struct.S]], ptr [[VAR24]], i32 0, i32 0, i32 0 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i64 6 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP5]] @@ -2148,13 +2148,13 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 1 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[TMP2]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[TMP2]], i64 1 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP3]], i64 1 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[TMP4]], i64 6 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP4]], i64 6 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [1 x [6 x %struct.S]], ptr [[VAR24]], i32 0, i32 0, i32 0 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i64 6 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP5]] @@ -2335,13 +2335,13 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 1 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[TMP2]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP2]], i64 1 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i64 1 +// 
CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP3]], i64 1 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[TMP4]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP4]], i64 1 // CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR24]]) // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8 @@ -2459,8 +2459,8 @@ int main() { // CHECK1-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [5 x %struct.S], ptr [[TMP0]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [5 x %struct.S], ptr [[TMP0]], i64 0, i64 4 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [5 x %struct.S], ptr [[TMP0]], i64 0, i64 0 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [5 x %struct.S], ptr [[TMP0]], i64 0, i64 4 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [5 x %struct.S], ptr [[VVAR22]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_S:%.*]], ptr [[ARRAY_BEGIN]], i64 5 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]] @@ -2641,9 +2641,9 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x %struct.S], ptr [[TMP2]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4 x %struct.S], ptr [[TMP2]], i64 0, i64 1 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[_TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x %struct.S], ptr [[TMP3]], i64 0, i64 2 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [4 x %struct.S], ptr [[TMP3]], i64 0, i64 2 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[VAR34]], i32 0, i32 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_S:%.*]], ptr [[ARRAY_BEGIN]], i64 2 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP4]] @@ -2826,9 +2826,9 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x %struct.S], ptr [[TMP2]], i64 0, i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4 x %struct.S], ptr [[TMP2]], i64 0, i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[_TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x %struct.S], ptr [[TMP3]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [4 x %struct.S], ptr [[TMP3]], i64 0, i64 1 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[VAR34]], i32 0, i32 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_S:%.*]], ptr [[ARRAY_BEGIN]], i64 2 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP4]] @@ -3012,9 +3012,9 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 
4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x %struct.S], ptr [[TMP2]], i64 0, i64 2 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4 x %struct.S], ptr [[TMP2]], i64 0, i64 2 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[_TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x %struct.S], ptr [[TMP3]], i64 0, i64 3 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [4 x %struct.S], ptr [[TMP3]], i64 0, i64 3 // CHECK1-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[ARRAYIDX3]] to i64 // CHECK1-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK1-NEXT: [[TMP6:%.*]] = sub i64 [[TMP4]], [[TMP5]] @@ -3974,8 +3974,8 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [42 x %struct.S.0], ptr [[TMP0]], i64 0, i64 1 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [42 x %struct.S.0], ptr [[TMP0]], i64 0, i64 40 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [42 x %struct.S.0], ptr [[TMP0]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [42 x %struct.S.0], ptr [[TMP0]], i64 0, i64 40 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [40 x %struct.S.0], ptr [[ARR4]], i32 0, i32 0 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i64 40 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP6]] diff --git a/clang/test/OpenMP/for_reduction_codegen_UDR.cpp b/clang/test/OpenMP/for_reduction_codegen_UDR.cpp index 16d6c23542fce..82f94c949eea6 100644 --- a/clang/test/OpenMP/for_reduction_codegen_UDR.cpp +++ b/clang/test/OpenMP/for_reduction_codegen_UDR.cpp @@ -1074,14 +1074,14 @@ int main() { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP5:%.*]] = mul nsw i64 1, [[TMP1]] // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP5]] -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX]], i64 0 // CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [2 x i32], ptr [[TMP3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4 // CHECK1-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP7]] // CHECK1-NEXT: [[TMP8:%.*]] = mul nsw i64 1, [[TMP1]] // CHECK1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP8]] -// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX5]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX5]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARRAYIDX6]] to i64 // CHECK1-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARRAYIDX3]] to i64 // CHECK1-NEXT: [[TMP11:%.*]] = sub i64 [[TMP9]], [[TMP10]] @@ -1109,16 +1109,16 @@ int main() { // CHECK1-NEXT: [[TMP19:%.*]] = sub i64 [[TMP17]], [[TMP18]] // CHECK1-NEXT: [[TMP20:%.*]] = sdiv exact i64 [[TMP19]], ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64) // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[VLA7]], i64 [[TMP20]] -// 
CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [4 x %struct.S.0]], ptr [[TMP4]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw [10 x [4 x %struct.S.0]], ptr [[TMP4]], i64 0, i64 1 // CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x %struct.S.0], ptr [[ARRAYIDX9]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], ptr [[ARRAYDECAY]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[ARRAYDECAY]], i64 1 // CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [2 x i32], ptr [[TMP3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX11]], align 4 // CHECK1-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN12:%.*]] = add nsw i64 0, [[TMP23]] -// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [4 x %struct.S.0]], ptr [[TMP4]], i64 0, i64 [[LB_ADD_LEN12]] +// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw [10 x [4 x %struct.S.0]], ptr [[TMP4]], i64 0, i64 [[LB_ADD_LEN12]] // CHECK1-NEXT: [[ARRAYDECAY14:%.*]] = getelementptr inbounds [4 x %struct.S.0], ptr [[ARRAYIDX13]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDECAY14]], i64 2 +// CHECK1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[ARRAYDECAY14]], i64 2 // CHECK1-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[ARRAYIDX15]] to i64 // CHECK1-NEXT: [[TMP25:%.*]] = ptrtoint ptr [[ARRAYIDX10]] to i64 // CHECK1-NEXT: [[TMP26:%.*]] = sub i64 [[TMP24]], [[TMP25]] @@ -1669,13 +1669,13 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], ptr [[TMP2]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[TMP2]], i64 1 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i64 4 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP3]], i64 4 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[TMP4]], i64 6 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP4]], i64 6 // CHECK1-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ARRAYIDX3]] to i64 // CHECK1-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ARRAYIDX1]] to i64 // CHECK1-NEXT: [[TMP7:%.*]] = sub i64 [[TMP5]], [[TMP6]] @@ -1877,8 +1877,8 @@ int main() { // CHECK1-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [5 x %struct.S.0], ptr [[TMP0]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [5 x %struct.S.0], ptr [[TMP0]], i64 0, i64 4 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [5 x %struct.S.0], ptr [[TMP0]], i64 0, i64 0 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = 
getelementptr inbounds nuw [5 x %struct.S.0], ptr [[TMP0]], i64 0, i64 4 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [5 x %struct.S.0], ptr [[VVAR22]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_S_0:%.*]], ptr [[ARRAY_BEGIN]], i64 5 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]] @@ -2066,9 +2066,9 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x %struct.S.0], ptr [[TMP2]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4 x %struct.S.0], ptr [[TMP2]], i64 0, i64 1 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[_TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x %struct.S.0], ptr [[TMP3]], i64 0, i64 2 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [4 x %struct.S.0], ptr [[TMP3]], i64 0, i64 2 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[VAR34]], i32 0, i32 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_S_0:%.*]], ptr [[ARRAY_BEGIN]], i64 2 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP4]] @@ -2979,8 +2979,8 @@ int main() { // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [42 x %struct.S], ptr [[TMP0]], i64 0, i64 1 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [42 x %struct.S], ptr [[TMP0]], i64 0, i64 40 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [42 x %struct.S], ptr [[TMP0]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw [42 x %struct.S], ptr [[TMP0]], i64 0, i64 40 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [40 x %struct.S], ptr [[ARR4]], i32 0, i32 0 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr [[STRUCT_S:%.*]], ptr [[ARRAY_BEGIN]], i64 40 // CHECK1-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP6]] diff --git a/clang/test/OpenMP/for_reduction_task_codegen.cpp b/clang/test/OpenMP/for_reduction_task_codegen.cpp index ea93323de77d0..b875279c2a144 100644 --- a/clang/test/OpenMP/for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/for_reduction_task_codegen.cpp @@ -68,16 +68,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP5:%.*]] = sext i32 [[TMP4]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP5]] // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP6]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = 
getelementptr inbounds nuw ptr, ptr [[TMP6]], i64 9 // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP10:%.*]] = sub i64 [[TMP8]], [[TMP9]] @@ -107,7 +107,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP21]] // CHECK1-NEXT: store ptr [[_TMP6]], ptr [[_TMP5]], align 8 // CHECK1-NEXT: store ptr [[TMP22]], ptr [[_TMP6]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP23]], align 8 // CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -122,19 +122,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP28]], align 8 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP29]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds ptr, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load ptr, ptr [[ARRAYIDX8]], align 8 -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[TMP32]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP32]], i64 0 // CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP34:%.*]] = sext i32 [[TMP33]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN10:%.*]] = add nsw i64 -1, [[TMP34]] // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr [[TMP35]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP35]], i64 9 // CHECK1-NEXT: [[TMP36:%.*]] = load ptr, ptr [[ARRAYIDX11]], align 8 -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, ptr [[TMP36]], i64 [[LB_ADD_LEN10]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP36]], i64 [[LB_ADD_LEN10]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP30]], align 8 // CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX9]], ptr [[TMP37]], align 8 @@ 
-459,9 +459,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/for_scan_codegen.cpp b/clang/test/OpenMP/for_scan_codegen.cpp index 4cf18a76fbfef..61e6534db471e 100644 --- a/clang/test/OpenMP/for_scan_codegen.cpp +++ b/clang/test/OpenMP/for_scan_codegen.cpp @@ -39,13 +39,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_BUF_IDX]], ptr {{.*}}[[A_PRIV]], i64 [[BYTES]], i1 false) // b_buffer[i] = b_priv; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX]] // CHECK: [[B_PRIV:%.+]] = load double, ptr [[B_PRIV_ADDR]], // CHECK: store double [[B_PRIV]], ptr [[B_BUF_IDX]], // CHECK: br label %[[LOOP_CONTINUE:.+]] @@ -72,13 +72,13 @@ void baz(int n) { // a_buffer[i] += a_buffer[i-pow(2, k)]; // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[I]] + // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[I]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] - // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] + // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] // CHECK: [[A_BUF_END:%.+]] = getelementptr float, ptr [[A_BUF_IDX]], i64 [[NUM_ELEMS]] // CHECK: 
[[ISEMPTY:%.+]] = icmp eq ptr [[A_BUF_IDX]], [[A_BUF_END]] // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]] @@ -132,13 +132,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_PRIV]], ptr {{.*}}[[A_BUF_IDX]], i64 [[BYTES]], i1 false) // b_priv = b_buffer[i]; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX]] // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, ptr [[B_BUF_IDX]], // CHECK: store double [[B_BUF_IDX_VAL]], ptr [[B_PRIV_ADDR]], // CHECK: br label %[[SCAN_PHASE:[^,]+]] @@ -179,13 +179,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_BUF_IDX]], ptr {{.*}}[[A_PRIV]], i64 [[BYTES]], i1 false) // b_buffer[i] = b_priv; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX]] // CHECK: [[B_PRIV:%.+]] = load double, ptr [[B_PRIV_ADDR]], // CHECK: store double [[B_PRIV]], ptr [[B_BUF_IDX]], // CHECK: br label %[[LOOP_CONTINUE:[^,]+]] @@ -217,13 +217,13 @@ void baz(int n) { // a_buffer[i] += a_buffer[i-pow(2, k)]; // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[I]] + // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[I]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] - // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] + // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = 
getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] // CHECK: [[A_BUF_END:%.+]] = getelementptr float, ptr [[A_BUF_IDX]], i64 [[NUM_ELEMS]] // CHECK: [[ISEMPTY:%.+]] = icmp eq ptr [[A_BUF_IDX]], [[A_BUF_END]] // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]] @@ -280,13 +280,13 @@ void baz(int n) { // CHECK: [[IF_THEN]]: // CHECK: [[BASE_IDX_SUB_1:%.+]] = sub nuw i64 [[BASE_IDX]], 1 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX_SUB_1]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_PRIV]], ptr {{.*}}[[A_BUF_IDX]], i64 [[BYTES]], i1 false) // b_priv = b_buffer[i]; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX_SUB_1]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX_SUB_1]] // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, ptr [[B_BUF_IDX]], // CHECK: store double [[B_BUF_IDX_VAL]], ptr [[B_PRIV_ADDR]], // CHECK: br label %[[SCAN_PHASE]] diff --git a/clang/test/OpenMP/for_simd_scan_codegen.cpp b/clang/test/OpenMP/for_simd_scan_codegen.cpp index 29af5f74c5b5b..829f2656042fb 100644 --- a/clang/test/OpenMP/for_simd_scan_codegen.cpp +++ b/clang/test/OpenMP/for_simd_scan_codegen.cpp @@ -39,13 +39,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_BUF_IDX]], ptr {{.*}}[[A_PRIV]], i64 [[BYTES]], i1 false) // b_buffer[i] = b_priv; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX]] // CHECK: [[B_PRIV:%.+]] = load double, ptr [[B_PRIV_ADDR]], // CHECK: store double [[B_PRIV]], ptr [[B_BUF_IDX]], // CHECK: br label %[[LOOP_CONTINUE:.+]] @@ -72,13 +72,13 @@ void baz(int n) { // a_buffer[i] += a_buffer[i-pow(2, k)]; // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[I]] + // CHECK: 
[[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[I]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] - // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] + // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] // CHECK: [[A_BUF_END:%.+]] = getelementptr float, ptr [[A_BUF_IDX]], i64 [[NUM_ELEMS]] // CHECK: [[ISEMPTY:%.+]] = icmp eq ptr [[A_BUF_IDX]], [[A_BUF_END]] // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]] @@ -132,13 +132,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_PRIV]], ptr {{.*}}[[A_BUF_IDX]], i64 [[BYTES]], i1 false) // b_priv = b_buffer[i]; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX]] // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, ptr [[B_BUF_IDX]], // CHECK: store double [[B_BUF_IDX_VAL]], ptr [[B_PRIV_ADDR]], // CHECK: br label %[[SCAN_PHASE:[^,]+]] @@ -179,13 +179,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_BUF_IDX]], ptr {{.*}}[[A_PRIV]], i64 [[BYTES]], i1 false) // b_buffer[i] = b_priv; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX]] // CHECK: [[B_PRIV:%.+]] = load double, ptr [[B_PRIV_ADDR]], // CHECK: store double [[B_PRIV]], ptr [[B_BUF_IDX]], // CHECK: br label %[[LOOP_CONTINUE:[^,]+]] @@ -217,13 +217,13 @@ void baz(int n) { // a_buffer[i] += a_buffer[i-pow(2, k)]; // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]] - // CHECK: 
[[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[I]] + // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[I]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] - // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] + // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] // CHECK: [[A_BUF_END:%.+]] = getelementptr float, ptr [[A_BUF_IDX]], i64 [[NUM_ELEMS]] // CHECK: [[ISEMPTY:%.+]] = icmp eq ptr [[A_BUF_IDX]], [[A_BUF_END]] // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]] @@ -280,13 +280,13 @@ void baz(int n) { // CHECK: [[IF_THEN]]: // CHECK: [[BASE_IDX_SUB_1:%.+]] = sub nuw i64 [[BASE_IDX]], 1 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX_SUB_1]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_PRIV]], ptr {{.*}}[[A_BUF_IDX]], i64 [[BYTES]], i1 false) // b_priv = b_buffer[i]; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX_SUB_1]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX_SUB_1]] // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, ptr [[B_BUF_IDX]], // CHECK: store double [[B_BUF_IDX_VAL]], ptr [[B_PRIV_ADDR]], // CHECK: br label %[[SCAN_PHASE]] diff --git a/clang/test/OpenMP/irbuilder_for_iterator.cpp b/clang/test/OpenMP/irbuilder_for_iterator.cpp index 0098a7db575c3..ec1c3af744b49 100644 --- a/clang/test/OpenMP/irbuilder_for_iterator.cpp +++ b/clang/test/OpenMP/irbuilder_for_iterator.cpp @@ -78,18 +78,18 @@ extern "C" void workshareloop_iterator(float *a, float *b, float *c) { // CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP8]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i64 [[IDXPROM]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i64 [[IDXPROM]] // CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP11]] to i64 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i64 [[IDXPROM2]] // CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP9]], [[TMP12]] // CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK-NEXT: 
[[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM4]] +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM4]] // CHECK-NEXT: store float [[MUL]], ptr [[ARRAYIDX5]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC]] // CHECK: omp_loop.inc: diff --git a/clang/test/OpenMP/irbuilder_for_rangefor.cpp b/clang/test/OpenMP/irbuilder_for_rangefor.cpp index 45b34621afbb9..86a043e638bc3 100644 --- a/clang/test/OpenMP/irbuilder_for_rangefor.cpp +++ b/clang/test/OpenMP/irbuilder_for_rangefor.cpp @@ -94,18 +94,18 @@ extern "C" void workshareloop_rangefor(float *a, float *b, float *c) { // CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP11]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[IDXPROM]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i64 [[IDXPROM]] // CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM2]] // CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP12]], [[TMP15]] // CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP17]] to i64 -// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM4]] +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM4]] // CHECK-NEXT: store float [[MUL]], ptr [[ARRAYIDX5]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC]] // CHECK: omp_loop.inc: diff --git a/clang/test/OpenMP/irbuilder_for_unsigned.c b/clang/test/OpenMP/irbuilder_for_unsigned.c index b0043b823ac85..675871a87b3bd 100644 --- a/clang/test/OpenMP/irbuilder_for_unsigned.c +++ b/clang/test/OpenMP/irbuilder_for_unsigned.c @@ -65,24 +65,24 @@ extern "C" void workshareloop_unsigned(float *a, float *b, float *c, float *d) { // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP10]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[IDXPROM]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP9]], i64 [[IDXPROM]] // CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP13]] to i64 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[IDXPROM2]] // CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP11]], [[TMP14]] // CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[I]], 
align 4
 // CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP16]] to i64
-// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i64 [[IDXPROM4]]
+// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP15]], i64 [[IDXPROM4]]
 // CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
 // CHECK-NEXT: [[MUL6:%.*]] = fmul float [[MUL]], [[TMP17]]
 // CHECK-NEXT: [[TMP18:%.*]] = load ptr, ptr [[A_ADDR]], align 8
 // CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[I]], align 4
 // CHECK-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP19]] to i64
-// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i64 [[IDXPROM7]]
+// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP18]], i64 [[IDXPROM7]]
 // CHECK-NEXT: store float [[MUL6]], ptr [[ARRAYIDX8]], align 4
 // CHECK-NEXT: br label [[OMP_LOOP_INC]]
 // CHECK: omp_loop.inc:
diff --git a/clang/test/OpenMP/irbuilder_for_unsigned_auto.c b/clang/test/OpenMP/irbuilder_for_unsigned_auto.c
index 19b2770bfa2df..39ede3ef971d0 100644
--- a/clang/test/OpenMP/irbuilder_for_unsigned_auto.c
+++ b/clang/test/OpenMP/irbuilder_for_unsigned_auto.c
@@ -66,24 +66,24 @@ extern "C" void workshareloop_unsigned_auto(float *a, float *b, float *c, float
 // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8
 // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4
 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP4]] to i64
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i64 [[IDXPROM]]
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i64 [[IDXPROM]]
 // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX]], align 4
 // CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[C_ADDR]], align 8
 // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[I]], align 4
 // CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP7]] to i64
-// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[IDXPROM2]]
+// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i64 [[IDXPROM2]]
 // CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX3]], align 4
 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], [[TMP8]]
 // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[D_ADDR]], align 8
 // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[I]], align 4
 // CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP10]] to i64
-// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[IDXPROM4]]
+// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP9]], i64 [[IDXPROM4]]
 // CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
 // CHECK-NEXT: [[MUL6:%.*]] = fmul float [[MUL]], [[TMP11]]
 // CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
 // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4
 // CHECK-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP13]] to i64
-// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[IDXPROM7]]
+// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[IDXPROM7]]
 // CHECK-NEXT: store float [[MUL6]], ptr [[ARRAYIDX8]], align 4
 // CHECK-NEXT: br label [[OMP_LOOP_INC]]
 // CHECK: omp_loop.inc:
diff --git a/clang/test/OpenMP/irbuilder_for_unsigned_down.c b/clang/test/OpenMP/irbuilder_for_unsigned_down.c
index 6e179826a6efa..5515f086c34a7 100644
--- a/clang/test/OpenMP/irbuilder_for_unsigned_down.c
+++ b/clang/test/OpenMP/irbuilder_for_unsigned_down.c
@@ -67,7 +67,7 @@ extern "C" void workshareloop_unsigned_down(float *a) {
 // CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[A_ADDR]], align 8
 // CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP11]] to i64
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[IDXPROM]]
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i64 [[IDXPROM]]
 // CHECK-NEXT: store float [[CONV]], ptr [[ARRAYIDX]], align 4
 // CHECK-NEXT: br label [[OMP_LOOP_INC]]
 // CHECK: omp_loop.inc:
diff --git a/clang/test/OpenMP/irbuilder_for_unsigned_dynamic.c b/clang/test/OpenMP/irbuilder_for_unsigned_dynamic.c
index 8f3297061938b..f20b60e608d2f 100644
--- a/clang/test/OpenMP/irbuilder_for_unsigned_dynamic.c
+++ b/clang/test/OpenMP/irbuilder_for_unsigned_dynamic.c
@@ -66,24 +66,24 @@ extern "C" void workshareloop_unsigned_dynamic(float *a, float *b, float *c, flo
 // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8
 // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4
 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP4]] to i64
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i64 [[IDXPROM]]
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i64 [[IDXPROM]]
 // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX]], align 4
 // CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[C_ADDR]], align 8
 // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[I]], align 4
 // CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP7]] to i64
-// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[IDXPROM2]]
+// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i64 [[IDXPROM2]]
 // CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX3]], align 4
 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], [[TMP8]]
 // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[D_ADDR]], align 8
 // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[I]], align 4
 // CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP10]] to i64
-// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[IDXPROM4]]
+// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP9]], i64 [[IDXPROM4]]
 // CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
 // CHECK-NEXT: [[MUL6:%.*]] = fmul float [[MUL]], [[TMP11]]
 // CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8
 // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4
 // CHECK-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP13]] to i64
-// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[IDXPROM7]]
+// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[IDXPROM7]]
 // CHECK-NEXT: store float [[MUL6]], ptr [[ARRAYIDX8]], align 4
 // CHECK-NEXT: br label [[OMP_LOOP_INC]]
 // CHECK: omp_loop.inc:
diff --git a/clang/test/OpenMP/irbuilder_for_unsigned_dynamic_chunked.c b/clang/test/OpenMP/irbuilder_for_unsigned_dynamic_chunked.c
index c2b0948bf7aeb..599f256243b11 100644
--- a/clang/test/OpenMP/irbuilder_for_unsigned_dynamic_chunked.c
+++ b/clang/test/OpenMP/irbuilder_for_unsigned_dynamic_chunked.c
@@ -66,24 +66,24 @@ extern "C" void workshareloop_unsigned_dynamic_chunked(float *a, float *b, float
 // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8
 // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4
 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP4]] to i64
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr 
[[TMP3]], i64 [[IDXPROM]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i64 [[IDXPROM]] // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i64 [[IDXPROM2]] // CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], [[TMP8]] // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP10]] to i64 -// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[IDXPROM4]] +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP9]], i64 [[IDXPROM4]] // CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 // CHECK-NEXT: [[MUL6:%.*]] = fmul float [[MUL]], [[TMP11]] // CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP13]] to i64 -// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[IDXPROM7]] +// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[IDXPROM7]] // CHECK-NEXT: store float [[MUL6]], ptr [[ARRAYIDX8]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC]] // CHECK: omp_loop.inc: diff --git a/clang/test/OpenMP/irbuilder_for_unsigned_runtime.c b/clang/test/OpenMP/irbuilder_for_unsigned_runtime.c index 68becf9f694ad..c27bcba155910 100644 --- a/clang/test/OpenMP/irbuilder_for_unsigned_runtime.c +++ b/clang/test/OpenMP/irbuilder_for_unsigned_runtime.c @@ -66,24 +66,24 @@ extern "C" void workshareloop_unsigned_runtime(float *a, float *b, float *c, flo // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP4]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i64 [[IDXPROM]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i64 [[IDXPROM]] // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP7]] to i64 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i64 [[IDXPROM2]] // CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], [[TMP8]] // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP10]] to i64 -// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[IDXPROM4]] +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP9]], i64 [[IDXPROM4]] // CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 // CHECK-NEXT: [[MUL6:%.*]] = fmul float [[MUL]], 
[[TMP11]] // CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP13]] to i64 -// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[IDXPROM7]] +// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[IDXPROM7]] // CHECK-NEXT: store float [[MUL6]], ptr [[ARRAYIDX8]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC]] // CHECK: omp_loop.inc: diff --git a/clang/test/OpenMP/irbuilder_for_unsigned_static_chunked.c b/clang/test/OpenMP/irbuilder_for_unsigned_static_chunked.c index 71fb6b5473da8..b937568ca9f11 100644 --- a/clang/test/OpenMP/irbuilder_for_unsigned_static_chunked.c +++ b/clang/test/OpenMP/irbuilder_for_unsigned_static_chunked.c @@ -108,24 +108,24 @@ extern "C" void workshareloop_unsigned_static_chunked(float *a, float *b, float // CHECK-NEXT: [[TMP17:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM]] // CHECK-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: [[TMP20:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM2]] // CHECK-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 // CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP19]], [[TMP22]] // CHECK-NEXT: [[TMP23:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM4]] +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM4]] // CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 // CHECK-NEXT: [[MUL6:%.*]] = fmul float [[MUL]], [[TMP25]] // CHECK-NEXT: [[TMP26:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP27]] to i64 -// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[IDXPROM7]] +// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP26]], i64 [[IDXPROM7]] // CHECK-NEXT: store float [[MUL6]], ptr [[ARRAYIDX8]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC]] // CHECK: omp_loop.inc: diff --git a/clang/test/OpenMP/map_struct_ordering.cpp b/clang/test/OpenMP/map_struct_ordering.cpp index d5b22d8ff2a4d..a52ddad465f37 100644 --- a/clang/test/OpenMP/map_struct_ordering.cpp +++ b/clang/test/OpenMP/map_struct_ordering.cpp @@ -57,7 +57,7 @@ int map_struct() { // CHECK-NEXT: [[DATUM:%.*]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DAT]], i32 0, i32 0 // CHECK-NEXT: [[DATUM2:%.*]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[DAT]], i32 0, i32 0 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DATUM2]], align 8 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], 
i64 0 +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i64 0 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_DESCRIPTOR]], ptr [[DAT]], i32 1 // CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 // CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[DAT]] to i64 diff --git a/clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp b/clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp index e90f0783787c0..7d467293d0c8f 100644 --- a/clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp +++ b/clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp @@ -76,7 +76,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP2]], align 16 // CHECK1-NEXT: store i64 [[TMP2]], ptr [[__VLA_EXPR0]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[A]], ptr [[TMP4]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -91,7 +91,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP9]], align 8 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP10]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[B]], ptr [[TMP11]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 1 @@ -106,7 +106,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..2, ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP17]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC_ADDR]], ptr [[TMP18]], align 8 // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 1 @@ -124,7 +124,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP25:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 3, ptr [[DOTRD_INPUT_]]) // CHECK1-NEXT: store ptr [[TMP25]], ptr [[DOTTASK_RED_]], 
align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 // CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0:%.*]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[C]], ptr [[TMP26]], align 8 // CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 1 @@ -139,7 +139,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..6, ptr [[TMP31]], align 8 // CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP32]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP33]], align 8 // CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 1 diff --git a/clang/test/OpenMP/master_taskloop_reduction_codegen.cpp b/clang/test/OpenMP/master_taskloop_reduction_codegen.cpp index 180ff3a94d24c..b0652c843845c 100644 --- a/clang/test/OpenMP/master_taskloop_reduction_codegen.cpp +++ b/clang/test/OpenMP/master_taskloop_reduction_codegen.cpp @@ -84,9 +84,9 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB1:.+]], ptr [[TMP25]], // CHECK-DAG: [[TMP26:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 8 [[TMP26]], i8 0, i64 4, i1 false) -// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 0 +// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 0 // CHECK-DAG: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, % -// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] +// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], // CHECK-DAG: [[TMP28]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_4:%.+]], i32 0, i32 0 // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], @@ -138,10 +138,10 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB4:.+]], ptr [[TMP59]], // CHECK-DAG: [[TMP60:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_8]], i32 0, i32 6 // CHECK-DAG: store i32 1, ptr [[TMP60]], -// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr 
inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 // CHECK: [[TMP62:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 4, ptr [[DOTRD_INPUT_]]) // CHECK: [[TMP63:%.*]] = load i32, ptr [[N]], // CHECK: store i32 [[TMP63]], ptr [[DOTCAPTURE_EXPR_]], diff --git a/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp b/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp index 2061da7c4d781..b0d00c5f539b1 100644 --- a/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp +++ b/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp @@ -76,7 +76,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP2]], align 16 // CHECK1-NEXT: store i64 [[TMP2]], ptr [[__VLA_EXPR0]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[A]], ptr [[TMP4]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -91,7 +91,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP9]], align 8 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP10]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[B]], ptr [[TMP11]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 1 @@ -106,7 +106,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..2, ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP17]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 +// CHECK1-NEXT: 
[[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC_ADDR]], ptr [[TMP18]], align 8 // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 1 @@ -124,7 +124,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP25:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 3, ptr [[DOTRD_INPUT_]]) // CHECK1-NEXT: store ptr [[TMP25]], ptr [[DOTTASK_RED_]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 // CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0:%.*]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[C]], ptr [[TMP26]], align 8 // CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 1 @@ -139,7 +139,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..6, ptr [[TMP31]], align 8 // CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP32]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP33]], align 8 // CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 1 diff --git a/clang/test/OpenMP/master_taskloop_simd_reduction_codegen.cpp b/clang/test/OpenMP/master_taskloop_simd_reduction_codegen.cpp index a69844dc4dee2..7def61251b24e 100644 --- a/clang/test/OpenMP/master_taskloop_simd_reduction_codegen.cpp +++ b/clang/test/OpenMP/master_taskloop_simd_reduction_codegen.cpp @@ -80,9 +80,9 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB1:.+]], ptr [[TMP25]], // CHECK-DAG: [[TMP26:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 8 [[TMP26]], i8 0, i64 4, i1 false) -// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 0 +// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 0 // CHECK-DAG: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, % -// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] +// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], // CHECK-DAG: [[TMP28]] = getelementptr inbounds nuw 
%struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_4:%.+]], i32 0, i32 0 // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], @@ -134,10 +134,10 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB4:.+]], ptr [[TMP59]], // CHECK-DAG: [[TMP60:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_8]], i32 0, i32 6 // CHECK-DAG: store i32 1, ptr [[TMP60]], -// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 // CHECK: [[TMP62:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 4, ptr [[DOTRD_INPUT_]]) // CHECK: [[TMP63:%.*]] = load i32, ptr [[N]], // CHECK: store i32 [[TMP63]], ptr [[DOTCAPTURE_EXPR_]], diff --git a/clang/test/OpenMP/ordered_codegen.cpp b/clang/test/OpenMP/ordered_codegen.cpp index 0a73eaefc9808..67285cfaef34d 100644 --- a/clang/test/OpenMP/ordered_codegen.cpp +++ b/clang/test/OpenMP/ordered_codegen.cpp @@ -255,21 +255,21 @@ void foo_simd(int low, int up) { // CHECK1-NEXT: call void @__kmpc_ordered(ptr @[[GLOB1]], i32 [[TMP0]]) // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK1-NEXT: [[TMP7:%.*]] = load i64, ptr [[I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP7]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i64 [[TMP7]] // CHECK1-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK1-NEXT: [[TMP10:%.*]] = load i64, ptr [[I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP10]] +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP9]], i64 [[TMP10]] // CHECK1-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 // CHECK1-NEXT: [[MUL3:%.*]] = fmul float [[TMP8]], [[TMP11]] // CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK1-NEXT: [[TMP13:%.*]] = load i64, ptr [[I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP13]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP13]] // CHECK1-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 // CHECK1-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP14]] // CHECK1-NEXT: [[TMP15:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK1-NEXT: [[TMP16:%.*]] = load i64, ptr [[I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i64 [[TMP16]] +// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = 
getelementptr inbounds nuw float, ptr [[TMP15]], i64 [[TMP16]] // CHECK1-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4 // CHECK1-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[TMP0]]) // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] @@ -485,24 +485,24 @@ void foo_simd(int low, int up) { // CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK1-NEXT: [[TMP9:%.*]] = load i8, ptr [[I]], align 1 // CHECK1-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP9]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i64 [[IDXPROM]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i64 [[IDXPROM]] // CHECK1-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = load i8, ptr [[I]], align 1 // CHECK1-NEXT: [[IDXPROM7:%.*]] = zext i8 [[TMP12]] to i64 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[IDXPROM7]] +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[IDXPROM7]] // CHECK1-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX8]], align 4 // CHECK1-NEXT: [[MUL9:%.*]] = fmul float [[TMP10]], [[TMP13]] // CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = load i8, ptr [[I]], align 1 // CHECK1-NEXT: [[IDXPROM10:%.*]] = zext i8 [[TMP15]] to i64 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM10]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM10]] // CHECK1-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX11]], align 4 // CHECK1-NEXT: [[MUL12:%.*]] = fmul float [[MUL9]], [[TMP16]] // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK1-NEXT: [[TMP18:%.*]] = load i8, ptr [[I]], align 1 // CHECK1-NEXT: [[IDXPROM13:%.*]] = zext i8 [[TMP18]] to i64 -// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM13]] +// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM13]] // CHECK1-NEXT: store float [[MUL12]], ptr [[ARRAYIDX14]], align 4 // CHECK1-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[TMP0]]) // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] @@ -866,21 +866,21 @@ void foo_simd(int low, int up) { // CHECK1-IRBUILDER-NEXT: call void @__kmpc_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]]) // CHECK1-IRBUILDER-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK1-IRBUILDER-NEXT: [[TMP6:%.*]] = load i64, ptr [[I]], align 8 -// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i64 [[TMP6]] +// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP5]], i64 [[TMP6]] // CHECK1-IRBUILDER-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK1-IRBUILDER-NEXT: [[TMP8:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK1-IRBUILDER-NEXT: [[TMP9:%.*]] = load i64, ptr [[I]], align 8 -// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i64 [[TMP9]] +// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i64 [[TMP9]] // CHECK1-IRBUILDER-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 // CHECK1-IRBUILDER-NEXT: [[MUL5:%.*]] = fmul float [[TMP7]], [[TMP10]] // CHECK1-IRBUILDER-NEXT: [[TMP11:%.*]] = load ptr, ptr 
[[D_ADDR]], align 8 // CHECK1-IRBUILDER-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8 -// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]] +// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]] // CHECK1-IRBUILDER-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX6]], align 4 // CHECK1-IRBUILDER-NEXT: [[MUL7:%.*]] = fmul float [[MUL5]], [[TMP13]] // CHECK1-IRBUILDER-NEXT: [[TMP14:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK1-IRBUILDER-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8 -// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]] +// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]] // CHECK1-IRBUILDER-NEXT: store float [[MUL7]], ptr [[ARRAYIDX8]], align 4 // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body.ordered.after: @@ -1110,24 +1110,24 @@ void foo_simd(int low, int up) { // CHECK1-IRBUILDER-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK1-IRBUILDER-NEXT: [[TMP8:%.*]] = load i8, ptr [[I]], align 1 // CHECK1-IRBUILDER-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP8]] to i64 -// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i64 [[IDXPROM]] +// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i64 [[IDXPROM]] // CHECK1-IRBUILDER-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK1-IRBUILDER-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK1-IRBUILDER-NEXT: [[TMP11:%.*]] = load i8, ptr [[I]], align 1 // CHECK1-IRBUILDER-NEXT: [[IDXPROM9:%.*]] = zext i8 [[TMP11]] to i64 -// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[IDXPROM9]] +// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i64 [[IDXPROM9]] // CHECK1-IRBUILDER-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX10]], align 4 // CHECK1-IRBUILDER-NEXT: [[MUL11:%.*]] = fmul float [[TMP9]], [[TMP12]] // CHECK1-IRBUILDER-NEXT: [[TMP13:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK1-IRBUILDER-NEXT: [[TMP14:%.*]] = load i8, ptr [[I]], align 1 // CHECK1-IRBUILDER-NEXT: [[IDXPROM12:%.*]] = zext i8 [[TMP14]] to i64 -// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM12]] +// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM12]] // CHECK1-IRBUILDER-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX13]], align 4 // CHECK1-IRBUILDER-NEXT: [[MUL14:%.*]] = fmul float [[MUL11]], [[TMP15]] // CHECK1-IRBUILDER-NEXT: [[TMP16:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK1-IRBUILDER-NEXT: [[TMP17:%.*]] = load i8, ptr [[I]], align 1 // CHECK1-IRBUILDER-NEXT: [[IDXPROM15:%.*]] = zext i8 [[TMP17]] to i64 -// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM15]] +// CHECK1-IRBUILDER-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM15]] // CHECK1-IRBUILDER-NEXT: store float [[MUL14]], ptr [[ARRAYIDX16]], align 4 // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body.ordered.after: @@ -1495,21 +1495,21 @@ void foo_simd(int low, int up) { // CHECK3-NEXT: call void @__kmpc_ordered(ptr @[[GLOB1]], 
i32 [[TMP0]]) // CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK3-NEXT: [[TMP7:%.*]] = load i64, ptr [[I]], align 8 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP7]] +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i64 [[TMP7]] // CHECK3-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK3-NEXT: [[TMP10:%.*]] = load i64, ptr [[I]], align 8 -// CHECK3-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[TMP10]] +// CHECK3-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP9]], i64 [[TMP10]] // CHECK3-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 // CHECK3-NEXT: [[MUL3:%.*]] = fmul float [[TMP8]], [[TMP11]] // CHECK3-NEXT: [[TMP12:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK3-NEXT: [[TMP13:%.*]] = load i64, ptr [[I]], align 8 -// CHECK3-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP13]] +// CHECK3-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP13]] // CHECK3-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 // CHECK3-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP14]] // CHECK3-NEXT: [[TMP15:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK3-NEXT: [[TMP16:%.*]] = load i64, ptr [[I]], align 8 -// CHECK3-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i64 [[TMP16]] +// CHECK3-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP15]], i64 [[TMP16]] // CHECK3-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4 // CHECK3-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[TMP0]]) // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] @@ -1725,24 +1725,24 @@ void foo_simd(int low, int up) { // CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK3-NEXT: [[TMP9:%.*]] = load i8, ptr [[I]], align 1 // CHECK3-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP9]] to i64 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i64 [[IDXPROM]] +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i64 [[IDXPROM]] // CHECK3-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK3-NEXT: [[TMP12:%.*]] = load i8, ptr [[I]], align 1 // CHECK3-NEXT: [[IDXPROM7:%.*]] = zext i8 [[TMP12]] to i64 -// CHECK3-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[IDXPROM7]] +// CHECK3-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[IDXPROM7]] // CHECK3-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX8]], align 4 // CHECK3-NEXT: [[MUL9:%.*]] = fmul float [[TMP10]], [[TMP13]] // CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK3-NEXT: [[TMP15:%.*]] = load i8, ptr [[I]], align 1 // CHECK3-NEXT: [[IDXPROM10:%.*]] = zext i8 [[TMP15]] to i64 -// CHECK3-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM10]] +// CHECK3-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM10]] // CHECK3-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX11]], align 4 // CHECK3-NEXT: [[MUL12:%.*]] = fmul float [[MUL9]], [[TMP16]] // CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK3-NEXT: [[TMP18:%.*]] = load i8, ptr [[I]], align 1 // CHECK3-NEXT: [[IDXPROM13:%.*]] = zext i8 
[[TMP18]] to i64 -// CHECK3-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM13]] +// CHECK3-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM13]] // CHECK3-NEXT: store float [[MUL12]], ptr [[ARRAYIDX14]], align 4 // CHECK3-NEXT: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[TMP0]]) // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] @@ -2106,21 +2106,21 @@ void foo_simd(int low, int up) { // CHECK3-IRBUILDER-NEXT: call void @__kmpc_ordered(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]]) // CHECK3-IRBUILDER-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK3-IRBUILDER-NEXT: [[TMP6:%.*]] = load i64, ptr [[I]], align 8 -// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i64 [[TMP6]] +// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP5]], i64 [[TMP6]] // CHECK3-IRBUILDER-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK3-IRBUILDER-NEXT: [[TMP8:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK3-IRBUILDER-NEXT: [[TMP9:%.*]] = load i64, ptr [[I]], align 8 -// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i64 [[TMP9]] +// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i64 [[TMP9]] // CHECK3-IRBUILDER-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 // CHECK3-IRBUILDER-NEXT: [[MUL5:%.*]] = fmul float [[TMP7]], [[TMP10]] // CHECK3-IRBUILDER-NEXT: [[TMP11:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK3-IRBUILDER-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8 -// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]] +// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]] // CHECK3-IRBUILDER-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX6]], align 4 // CHECK3-IRBUILDER-NEXT: [[MUL7:%.*]] = fmul float [[MUL5]], [[TMP13]] // CHECK3-IRBUILDER-NEXT: [[TMP14:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK3-IRBUILDER-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8 -// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]] +// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]] // CHECK3-IRBUILDER-NEXT: store float [[MUL7]], ptr [[ARRAYIDX8]], align 4 // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body.ordered.after: @@ -2350,24 +2350,24 @@ void foo_simd(int low, int up) { // CHECK3-IRBUILDER-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK3-IRBUILDER-NEXT: [[TMP8:%.*]] = load i8, ptr [[I]], align 1 // CHECK3-IRBUILDER-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP8]] to i64 -// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i64 [[IDXPROM]] +// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i64 [[IDXPROM]] // CHECK3-IRBUILDER-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK3-IRBUILDER-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK3-IRBUILDER-NEXT: [[TMP11:%.*]] = load i8, ptr [[I]], align 1 // CHECK3-IRBUILDER-NEXT: [[IDXPROM9:%.*]] = zext i8 [[TMP11]] to i64 -// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[IDXPROM9]] +// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX10:%.*]] = getelementptr 
inbounds nuw float, ptr [[TMP10]], i64 [[IDXPROM9]] // CHECK3-IRBUILDER-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX10]], align 4 // CHECK3-IRBUILDER-NEXT: [[MUL11:%.*]] = fmul float [[TMP9]], [[TMP12]] // CHECK3-IRBUILDER-NEXT: [[TMP13:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK3-IRBUILDER-NEXT: [[TMP14:%.*]] = load i8, ptr [[I]], align 1 // CHECK3-IRBUILDER-NEXT: [[IDXPROM12:%.*]] = zext i8 [[TMP14]] to i64 -// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM12]] +// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM12]] // CHECK3-IRBUILDER-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX13]], align 4 // CHECK3-IRBUILDER-NEXT: [[MUL14:%.*]] = fmul float [[MUL11]], [[TMP15]] // CHECK3-IRBUILDER-NEXT: [[TMP16:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK3-IRBUILDER-NEXT: [[TMP17:%.*]] = load i8, ptr [[I]], align 1 // CHECK3-IRBUILDER-NEXT: [[IDXPROM15:%.*]] = zext i8 [[TMP17]] to i64 -// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM15]] +// CHECK3-IRBUILDER-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM15]] // CHECK3-IRBUILDER-NEXT: store float [[MUL14]], ptr [[ARRAYIDX16]], align 4 // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body.ordered.after: @@ -2674,21 +2674,21 @@ void foo_simd(int low, int up) { // CHECK5: for.body: // CHECK5-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK5-NEXT: [[TMP2:%.*]] = load i64, ptr [[I]], align 8 -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 [[TMP2]] +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i64 [[TMP2]] // CHECK5-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK5-NEXT: [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK5-NEXT: [[TMP5:%.*]] = load i64, ptr [[I]], align 8 -// CHECK5-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 [[TMP5]] +// CHECK5-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw float, ptr [[TMP4]], i64 [[TMP5]] // CHECK5-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 // CHECK5-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], [[TMP6]] // CHECK5-NEXT: [[TMP7:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK5-NEXT: [[TMP8:%.*]] = load i64, ptr [[I]], align 8 -// CHECK5-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i64 [[TMP8]] +// CHECK5-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i64 [[TMP8]] // CHECK5-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 // CHECK5-NEXT: [[MUL3:%.*]] = fmul float [[MUL]], [[TMP9]] // CHECK5-NEXT: [[TMP10:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK5-NEXT: [[TMP11:%.*]] = load i64, ptr [[I]], align 8 -// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP11]] +// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i64 [[TMP11]] // CHECK5-NEXT: store float [[MUL3]], ptr [[ARRAYIDX4]], align 4 // CHECK5-NEXT: br label [[FOR_INC:%.*]] // CHECK5: for.inc: @@ -2804,24 +2804,24 @@ void foo_simd(int low, int up) { // CHECK5-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8 // CHECK5-NEXT: [[TMP3:%.*]] = load i8, ptr [[I]], align 1 // CHECK5-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP3]] to i64 -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr 
inbounds float, ptr [[TMP2]], i64 [[IDXPROM]] +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP2]], i64 [[IDXPROM]] // CHECK5-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK5-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8 // CHECK5-NEXT: [[TMP6:%.*]] = load i8, ptr [[I]], align 1 // CHECK5-NEXT: [[IDXPROM4:%.*]] = zext i8 [[TMP6]] to i64 -// CHECK5-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i64 [[IDXPROM4]] +// CHECK5-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP5]], i64 [[IDXPROM4]] // CHECK5-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 // CHECK5-NEXT: [[MUL:%.*]] = fmul float [[TMP4]], [[TMP7]] // CHECK5-NEXT: [[TMP8:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK5-NEXT: [[TMP9:%.*]] = load i8, ptr [[I]], align 1 // CHECK5-NEXT: [[IDXPROM6:%.*]] = zext i8 [[TMP9]] to i64 -// CHECK5-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i64 [[IDXPROM6]] +// CHECK5-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i64 [[IDXPROM6]] // CHECK5-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX7]], align 4 // CHECK5-NEXT: [[MUL8:%.*]] = fmul float [[MUL]], [[TMP10]] // CHECK5-NEXT: [[TMP11:%.*]] = load ptr, ptr [[A_ADDR]], align 8 // CHECK5-NEXT: [[TMP12:%.*]] = load i8, ptr [[I]], align 1 // CHECK5-NEXT: [[IDXPROM9:%.*]] = zext i8 [[TMP12]] to i64 -// CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[IDXPROM9]] +// CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[IDXPROM9]] // CHECK5-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4 // CHECK5-NEXT: br label [[FOR_INC:%.*]] // CHECK5: for.inc: diff --git a/clang/test/OpenMP/parallel_for_codegen.cpp b/clang/test/OpenMP/parallel_for_codegen.cpp index 2dec32d71b91a..c7afae419509b 100644 --- a/clang/test/OpenMP/parallel_for_codegen.cpp +++ b/clang/test/OpenMP/parallel_for_codegen.cpp @@ -665,24 +665,24 @@ void range_for_collapsed() { // CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK1-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8 // CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK1-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 // CHECK1-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK1-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8 // CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK1-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4 // CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // 
CHECK1-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK1-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4 // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -779,21 +779,21 @@ void range_for_collapsed() { // CHECK1-NEXT: store i64 [[ADD1]], ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]] // CHECK1-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]] +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]] // CHECK1-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP16]] // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: [[TMP18:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP18]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[TMP18]] // CHECK1-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP19]] // CHECK1-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: [[TMP21:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]] +// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[TMP21]] // CHECK1-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -882,21 +882,21 @@ void range_for_collapsed() { // CHECK1-NEXT: store i64 [[ADD1]], ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]] // CHECK1-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group 
[[ACC_GRP8]] // CHECK1-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]] +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]] // CHECK1-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP16]] // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[TMP18:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP18]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[TMP18]] // CHECK1-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP19]] // CHECK1-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[TMP21:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]] +// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[TMP21]] // CHECK1-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -1159,24 +1159,24 @@ void range_for_collapsed() { // CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[TMP14:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP14]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM]] // CHECK1-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[TMP17:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[IDXPROM6:%.*]] = zext i8 [[TMP17]] to i64 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM6]] +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM6]] // CHECK1-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP18]] // CHECK1-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[TMP20:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[IDXPROM9:%.*]] = zext i8 [[TMP20]] to i64 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[IDXPROM9]] +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP19]], i64 [[IDXPROM9]] // CHECK1-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[MUL11:%.*]] = fmul float [[MUL8]], [[TMP21]] // CHECK1-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP14]] // 
CHECK1-NEXT: [[TMP23:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: [[IDXPROM12:%.*]] = zext i8 [[TMP23]] to i64 -// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[IDXPROM12]] +// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP22]], i64 [[IDXPROM12]] // CHECK1-NEXT: store float [[MUL11]], ptr [[ARRAYIDX13]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: @@ -1303,7 +1303,7 @@ void range_for_collapsed() { // CHECK1-NEXT: [[CONV:%.*]] = sitofp i32 [[CALL]] to float // CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP13]] to i64 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[VLA1]], i64 [[IDXPROM]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[VLA1]], i64 [[IDXPROM]] // CHECK1-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK1-NEXT: [[ADD4:%.*]] = fadd float [[CONV]], [[TMP14]] // CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[N_ADDR]], align 4 @@ -1312,7 +1312,7 @@ void range_for_collapsed() { // CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP17]] to i64 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM7]] +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM7]] // CHECK1-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX8]], align 4 // CHECK1-NEXT: [[ADD9:%.*]] = fadd float [[TMP18]], [[ADD6]] // CHECK1-NEXT: store float [[ADD9]], ptr [[ARRAYIDX8]], align 4 @@ -1781,24 +1781,24 @@ void range_for_collapsed() { // CHECK2-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8 // CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4 // CHECK2-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK2-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK2-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8 // CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4 // CHECK2-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] +// CHECK2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK2-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 // CHECK2-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK2-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8 // CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4 // CHECK2-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK2-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4 // CHECK2-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK2-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK2-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4 // CHECK2-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK2-NEXT: 
[[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK2-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4 // CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK2: omp.body.continue: @@ -1895,21 +1895,21 @@ void range_for_collapsed() { // CHECK2-NEXT: store i64 [[ADD1]], ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]] +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]] // CHECK2-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]] +// CHECK2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]] // CHECK2-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP16]] // CHECK2-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: [[TMP18:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP18]] +// CHECK2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[TMP18]] // CHECK2-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP19]] // CHECK2-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: [[TMP21:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK2-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]] +// CHECK2-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[TMP21]] // CHECK2-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK2: omp.body.continue: @@ -1998,21 +1998,21 @@ void range_for_collapsed() { // CHECK2-NEXT: store i64 [[ADD1]], ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]] +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]] // CHECK2-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]] 
+// CHECK2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]] // CHECK2-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP16]] // CHECK2-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: [[TMP18:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP18]] +// CHECK2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[TMP18]] // CHECK2-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP19]] // CHECK2-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: [[TMP21:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK2-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]] +// CHECK2-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[TMP21]] // CHECK2-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK2: omp.body.continue: @@ -2275,24 +2275,24 @@ void range_for_collapsed() { // CHECK2-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[TMP14:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP14]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM]] +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM]] // CHECK2-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[TMP17:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[IDXPROM6:%.*]] = zext i8 [[TMP17]] to i64 -// CHECK2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM6]] +// CHECK2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM6]] // CHECK2-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP18]] // CHECK2-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[TMP20:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[IDXPROM9:%.*]] = zext i8 [[TMP20]] to i64 -// CHECK2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[IDXPROM9]] +// CHECK2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP19]], i64 [[IDXPROM9]] // CHECK2-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[MUL11:%.*]] = fmul float [[MUL8]], [[TMP21]] // CHECK2-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[TMP23:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: [[IDXPROM12:%.*]] = zext i8 [[TMP23]] to i64 -// CHECK2-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds 
float, ptr [[TMP22]], i64 [[IDXPROM12]] +// CHECK2-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP22]], i64 [[IDXPROM12]] // CHECK2-NEXT: store float [[MUL11]], ptr [[ARRAYIDX13]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK2: omp.body.continue: @@ -2419,7 +2419,7 @@ void range_for_collapsed() { // CHECK2-NEXT: [[CONV:%.*]] = sitofp i32 [[CALL]] to float // CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK2-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP13]] to i64 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[VLA1]], i64 [[IDXPROM]] +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[VLA1]], i64 [[IDXPROM]] // CHECK2-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK2-NEXT: [[ADD4:%.*]] = fadd float [[CONV]], [[TMP14]] // CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[N_ADDR]], align 4 @@ -2428,7 +2428,7 @@ void range_for_collapsed() { // CHECK2-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[I]], align 4 // CHECK2-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP17]] to i64 -// CHECK2-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM7]] +// CHECK2-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM7]] // CHECK2-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX8]], align 4 // CHECK2-NEXT: [[ADD9:%.*]] = fadd float [[TMP18]], [[ADD6]] // CHECK2-NEXT: store float [[ADD9]], ptr [[ARRAYIDX8]], align 4 @@ -2897,24 +2897,24 @@ void range_for_collapsed() { // CHECK5-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG45:![0-9]+]] // CHECK5-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG45]] // CHECK5-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64, !dbg [[DBG45]] -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]], !dbg [[DBG45]] +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]], !dbg [[DBG45]] // CHECK5-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !dbg [[DBG45]] // CHECK5-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8, !dbg [[DBG45]] // CHECK5-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG45]] // CHECK5-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64, !dbg [[DBG45]] -// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]], !dbg [[DBG45]] +// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]], !dbg [[DBG45]] // CHECK5-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !dbg [[DBG45]] // CHECK5-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]], !dbg [[DBG45]] // CHECK5-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8, !dbg [[DBG45]] // CHECK5-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG45]] // CHECK5-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64, !dbg [[DBG45]] -// CHECK5-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]], !dbg [[DBG45]] +// CHECK5-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]], !dbg [[DBG45]] // CHECK5-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !dbg [[DBG45]] // CHECK5-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]], !dbg [[DBG45]] // CHECK5-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8, !dbg [[DBG45]] // 
CHECK5-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG45]] // CHECK5-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64, !dbg [[DBG45]] -// CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]], !dbg [[DBG45]] +// CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]], !dbg [[DBG45]] // CHECK5-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4, !dbg [[DBG45]] // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG46:![0-9]+]] // CHECK5: omp.body.continue: @@ -3011,21 +3011,21 @@ void range_for_collapsed() { // CHECK5-NEXT: store i64 [[ADD1]], ptr [[I]], align 8, !dbg [[DBG54]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG56:![0-9]+]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]], !dbg [[DBG56]] +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]], !dbg [[DBG56]] // CHECK5-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP2]], align 8, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] -// CHECK5-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]], !dbg [[DBG56]] +// CHECK5-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]], !dbg [[DBG56]] // CHECK5-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP16]], !dbg [[DBG56]] // CHECK5-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP3]], align 8, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: [[TMP18:%.*]] = load i64, ptr [[I]], align 8, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] -// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP18]], !dbg [[DBG56]] +// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[TMP18]], !dbg [[DBG56]] // CHECK5-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP19]], !dbg [[DBG56]] // CHECK5-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP0]], align 8, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: [[TMP21:%.*]] = load i64, ptr [[I]], align 8, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] -// CHECK5-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]], !dbg [[DBG56]] +// CHECK5-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[TMP21]], !dbg [[DBG56]] // CHECK5-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4, !dbg [[DBG56]], !llvm.access.group [[ACC_GRP55]] // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG57:![0-9]+]] // CHECK5: omp.body.continue: @@ -3114,21 +3114,21 @@ void range_for_collapsed() { // CHECK5-NEXT: store i64 [[ADD1]], ptr [[I]], align 8, !dbg [[DBG66]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG68:![0-9]+]], !llvm.access.group 
[[ACC_GRP67]] // CHECK5-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]], !dbg [[DBG68]] +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]], !dbg [[DBG68]] // CHECK5-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP2]], align 8, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] -// CHECK5-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]], !dbg [[DBG68]] +// CHECK5-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]], !dbg [[DBG68]] // CHECK5-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP16]], !dbg [[DBG68]] // CHECK5-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP3]], align 8, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: [[TMP18:%.*]] = load i64, ptr [[I]], align 8, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] -// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP18]], !dbg [[DBG68]] +// CHECK5-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[TMP18]], !dbg [[DBG68]] // CHECK5-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP19]], !dbg [[DBG68]] // CHECK5-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP0]], align 8, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: [[TMP21:%.*]] = load i64, ptr [[I]], align 8, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] -// CHECK5-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]], !dbg [[DBG68]] +// CHECK5-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[TMP21]], !dbg [[DBG68]] // CHECK5-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4, !dbg [[DBG68]], !llvm.access.group [[ACC_GRP67]] // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG69:![0-9]+]] // CHECK5: omp.body.continue: @@ -3391,24 +3391,24 @@ void range_for_collapsed() { // CHECK5-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP1]], align 8, !dbg [[DBG97:![0-9]+]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[TMP14:%.*]] = load i8, ptr [[I]], align 1, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP14]] to i64, !dbg [[DBG97]] -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM]], !dbg [[DBG97]] +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM]], !dbg [[DBG97]] // CHECK5-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP2]], align 8, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[TMP17:%.*]] = load i8, ptr [[I]], align 1, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[IDXPROM6:%.*]] = zext i8 [[TMP17]] to i64, !dbg [[DBG97]] -// CHECK5-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 
[[IDXPROM6]], !dbg [[DBG97]] +// CHECK5-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM6]], !dbg [[DBG97]] // CHECK5-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP18]], !dbg [[DBG97]] // CHECK5-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP3]], align 8, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[TMP20:%.*]] = load i8, ptr [[I]], align 1, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[IDXPROM9:%.*]] = zext i8 [[TMP20]] to i64, !dbg [[DBG97]] -// CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[IDXPROM9]], !dbg [[DBG97]] +// CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP19]], i64 [[IDXPROM9]], !dbg [[DBG97]] // CHECK5-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX10]], align 4, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[MUL11:%.*]] = fmul float [[MUL8]], [[TMP21]], !dbg [[DBG97]] // CHECK5-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP0]], align 8, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[TMP23:%.*]] = load i8, ptr [[I]], align 1, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: [[IDXPROM12:%.*]] = zext i8 [[TMP23]] to i64, !dbg [[DBG97]] -// CHECK5-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[IDXPROM12]], !dbg [[DBG97]] +// CHECK5-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP22]], i64 [[IDXPROM12]], !dbg [[DBG97]] // CHECK5-NEXT: store float [[MUL11]], ptr [[ARRAYIDX13]], align 4, !dbg [[DBG97]], !llvm.access.group [[ACC_GRP95]] // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG98:![0-9]+]] // CHECK5: omp.body.continue: @@ -3535,7 +3535,7 @@ void range_for_collapsed() { // CHECK5-NEXT: [[CONV:%.*]] = sitofp i32 [[CALL]] to float, !dbg [[DBG111]] // CHECK5-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG111]] // CHECK5-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP13]] to i64, !dbg [[DBG111]] -// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[VLA1]], i64 [[IDXPROM]], !dbg [[DBG111]] +// CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[VLA1]], i64 [[IDXPROM]], !dbg [[DBG111]] // CHECK5-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !dbg [[DBG111]] // CHECK5-NEXT: [[ADD4:%.*]] = fadd float [[CONV]], [[TMP14]], !dbg [[DBG111]] // CHECK5-NEXT: [[TMP15:%.*]] = load i32, ptr [[N_ADDR]], align 4, !dbg [[DBG111]] @@ -3544,7 +3544,7 @@ void range_for_collapsed() { // CHECK5-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP0]], align 8, !dbg [[DBG111]] // CHECK5-NEXT: [[TMP17:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG111]] // CHECK5-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP17]] to i64, !dbg [[DBG111]] -// CHECK5-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM7]], !dbg [[DBG111]] +// CHECK5-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM7]], !dbg [[DBG111]] // CHECK5-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX8]], align 4, !dbg [[DBG111]] // CHECK5-NEXT: [[ADD9:%.*]] = fadd float [[TMP18]], [[ADD6]], !dbg [[DBG111]] // CHECK5-NEXT: store float [[ADD9]], ptr [[ARRAYIDX8]], align 4, !dbg [[DBG111]] @@ -4013,24 +4013,24 @@ void range_for_collapsed() { // CHECK6-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP1]], align 8 // CHECK6-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], 
align 4 // CHECK6-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP15]] to i64 -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[IDXPROM]] +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM]] // CHECK6-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK6-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP2]], align 8 // CHECK6-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4 // CHECK6-NEXT: [[IDXPROM3:%.*]] = zext i32 [[TMP18]] to i64 -// CHECK6-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[IDXPROM3]] +// CHECK6-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM3]] // CHECK6-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 // CHECK6-NEXT: [[MUL5:%.*]] = fmul float [[TMP16]], [[TMP19]] // CHECK6-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP3]], align 8 // CHECK6-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4 // CHECK6-NEXT: [[IDXPROM6:%.*]] = zext i32 [[TMP21]] to i64 -// CHECK6-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[IDXPROM6]] +// CHECK6-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[IDXPROM6]] // CHECK6-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX7]], align 4 // CHECK6-NEXT: [[MUL8:%.*]] = fmul float [[MUL5]], [[TMP22]] // CHECK6-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK6-NEXT: [[TMP24:%.*]] = load i32, ptr [[I]], align 4 // CHECK6-NEXT: [[IDXPROM9:%.*]] = zext i32 [[TMP24]] to i64 -// CHECK6-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[IDXPROM9]] +// CHECK6-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP23]], i64 [[IDXPROM9]] // CHECK6-NEXT: store float [[MUL8]], ptr [[ARRAYIDX10]], align 4 // CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK6: omp.body.continue: @@ -4127,21 +4127,21 @@ void range_for_collapsed() { // CHECK6-NEXT: store i64 [[ADD1]], ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]] +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]] // CHECK6-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK6-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]] +// CHECK6-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]] // CHECK6-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP16]] // CHECK6-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: [[TMP18:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK6-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP18]] +// CHECK6-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[TMP18]] // CHECK6-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 
4, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP19]] // CHECK6-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: [[TMP21:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP5]] -// CHECK6-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]] +// CHECK6-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[TMP21]] // CHECK6-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4, !llvm.access.group [[ACC_GRP5]] // CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK6: omp.body.continue: @@ -4230,21 +4230,21 @@ void range_for_collapsed() { // CHECK6-NEXT: store i64 [[ADD1]], ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: [[TMP12:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 [[TMP12]] +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP12]] // CHECK6-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: [[TMP15:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK6-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 [[TMP15]] +// CHECK6-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[TMP15]] // CHECK6-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP16]] // CHECK6-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: [[TMP18:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK6-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP18]] +// CHECK6-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[TMP18]] // CHECK6-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: [[MUL5:%.*]] = fmul float [[MUL3]], [[TMP19]] // CHECK6-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: [[TMP21:%.*]] = load i64, ptr [[I]], align 8, !llvm.access.group [[ACC_GRP8]] -// CHECK6-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP21]] +// CHECK6-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP20]], i64 [[TMP21]] // CHECK6-NEXT: store float [[MUL5]], ptr [[ARRAYIDX6]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK6: omp.body.continue: @@ -4507,24 +4507,24 @@ void range_for_collapsed() { // CHECK6-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[TMP14:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[IDXPROM:%.*]] = zext i8 [[TMP14]] to i64 -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM]] +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM]] // CHECK6-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4, 
!llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[TMP17:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[IDXPROM6:%.*]] = zext i8 [[TMP17]] to i64 -// CHECK6-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM6]] +// CHECK6-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM6]] // CHECK6-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX7]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP18]] // CHECK6-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[TMP20:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[IDXPROM9:%.*]] = zext i8 [[TMP20]] to i64 -// CHECK6-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[IDXPROM9]] +// CHECK6-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP19]], i64 [[IDXPROM9]] // CHECK6-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[MUL11:%.*]] = fmul float [[MUL8]], [[TMP21]] // CHECK6-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[TMP23:%.*]] = load i8, ptr [[I]], align 1, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: [[IDXPROM12:%.*]] = zext i8 [[TMP23]] to i64 -// CHECK6-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[IDXPROM12]] +// CHECK6-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP22]], i64 [[IDXPROM12]] // CHECK6-NEXT: store float [[MUL11]], ptr [[ARRAYIDX13]], align 4, !llvm.access.group [[ACC_GRP14]] // CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK6: omp.body.continue: @@ -4649,7 +4649,7 @@ void range_for_collapsed() { // CHECK6-NEXT: [[CONV:%.*]] = sitofp i32 [[CALL]] to float // CHECK6-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4 // CHECK6-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP13]] to i64 -// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[VLA1]], i64 [[IDXPROM]] +// CHECK6-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[VLA1]], i64 [[IDXPROM]] // CHECK6-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4 // CHECK6-NEXT: [[ADD4:%.*]] = fadd float [[CONV]], [[TMP14]] // CHECK6-NEXT: [[TMP15:%.*]] = load i32, ptr [[N_ADDR]], align 4 @@ -4658,7 +4658,7 @@ void range_for_collapsed() { // CHECK6-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK6-NEXT: [[TMP17:%.*]] = load i32, ptr [[I]], align 4 // CHECK6-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP17]] to i64 -// CHECK6-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM7]] +// CHECK6-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM7]] // CHECK6-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX8]], align 4 // CHECK6-NEXT: [[ADD9:%.*]] = fadd float [[TMP18]], [[ADD6]] // CHECK6-NEXT: store float [[ADD9]], ptr [[ARRAYIDX8]], align 4 diff --git a/clang/test/OpenMP/parallel_for_linear_codegen.cpp b/clang/test/OpenMP/parallel_for_linear_codegen.cpp index 8b46797ae253f..15eb0dfa42af5 100644 --- a/clang/test/OpenMP/parallel_for_linear_codegen.cpp +++ b/clang/test/OpenMP/parallel_for_linear_codegen.cpp @@ -337,7 +337,7 @@ int main() { // CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP14]], [[MUL6]] // 
CHECK1-NEXT: store i32 [[ADD7]], ptr [[LVAR3]], align 4 // CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[PVAR2]], align 8 -// CHECK1-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 1 +// CHECK1-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP16]], i32 1 // CHECK1-NEXT: store ptr [[INCDEC_PTR]], ptr [[PVAR2]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[LVAR3]], align 4 // CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP17]], 1 diff --git a/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp index 752aec788bf34..59d169d7a1738 100644 --- a/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp @@ -83,16 +83,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP4]] // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i64 9 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] @@ -122,7 +122,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP20]] // CHECK1-NEXT: store ptr [[_TMP6]], ptr [[_TMP5]], align 8 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[_TMP6]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -137,19 +137,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP27]], align 8 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = 
getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 0 // CHECK1-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds ptr, ptr [[TMP30]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP30]], i64 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[ARRAYIDX8]], align 8 -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN10:%.*]] = add nsw i64 -1, [[TMP33]] // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr [[TMP34]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP34]], i64 9 // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[ARRAYIDX11]], align 8 -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[LB_ADD_LEN10]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP35]], i64 [[LB_ADD_LEN10]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX9]], ptr [[TMP36]], align 8 @@ -470,9 +470,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/parallel_for_scan_codegen.cpp b/clang/test/OpenMP/parallel_for_scan_codegen.cpp index 161534814a793..67b32407c712f 100644 --- a/clang/test/OpenMP/parallel_for_scan_codegen.cpp +++ b/clang/test/OpenMP/parallel_for_scan_codegen.cpp @@ -28,9 +28,9 @@ void baz(int n) { // CHECK: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call( // CHECK: [[LAST:%.+]] = mul nsw i64 9, % - // CHECK: [[LAST_REF:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[LAST]] + // CHECK: [[LAST_REF:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[LAST]] // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 @_ZZ3baziE1a, ptr align 4 [[LAST_REF]], i64 %{{.+}}, i1 false) - // CHECK: [[LAST_REF_B:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 9 + // CHECK: [[LAST_REF_B:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 9 // CHECK: [[LAST_VAL:%.+]] = load double, ptr [[LAST_REF_B]], // CHECK: store double [[LAST_VAL]], ptr @_ZZ3baziE1b, @@ -58,13 +58,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS:%.+]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF:%.+]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF:%.+]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_BUF_IDX]], ptr {{.*}}[[A_PRIV]], i64 [[BYTES]], i1 false) // b_buffer[i] = b_priv; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF:%.+]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF:%.+]], i64 [[BASE_IDX]] // CHECK: [[B_PRIV:%.+]] = load double, ptr [[B_PRIV_ADDR]], // CHECK: store double [[B_PRIV]], ptr [[B_BUF_IDX]], // CHECK: br label %[[LOOP_CONTINUE:.+]] @@ -91,13 +91,13 @@ void baz(int n) { // a_buffer[i] += a_buffer[i-pow(2, k)]; // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[I]] + // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[I]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] - // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] + // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] // CHECK: [[A_BUF_END:%.+]] = getelementptr float, ptr [[A_BUF_IDX]], i64 [[NUM_ELEMS]] // CHECK: [[ISEMPTY:%.+]] = icmp eq ptr [[A_BUF_IDX]], [[A_BUF_END]] // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]] @@ -151,13 +151,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = 
getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_PRIV]], ptr {{.*}}[[A_BUF_IDX]], i64 [[BYTES]], i1 false) // b_priv = b_buffer[i]; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX]] // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, ptr [[B_BUF_IDX]], // CHECK: store double [[B_BUF_IDX_VAL]], ptr [[B_PRIV_ADDR]], // CHECK: br label %[[SCAN_PHASE:[^,]+]] @@ -188,13 +188,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS:%.+]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF:%.+]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF:%.+]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_BUF_IDX]], ptr {{.*}}[[A_PRIV]], i64 [[BYTES]], i1 false) // b_buffer[i] = b_priv; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF:%.+]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF:%.+]], i64 [[BASE_IDX]] // CHECK: [[B_PRIV:%.+]] = load double, ptr [[B_PRIV_ADDR]], // CHECK: store double [[B_PRIV]], ptr [[B_BUF_IDX]], // CHECK: br label %[[LOOP_CONTINUE:[^,]+]] @@ -226,13 +226,13 @@ void baz(int n) { // a_buffer[i] += a_buffer[i-pow(2, k)]; // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[I]] + // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[I]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] - // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] + // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] // CHECK: [[A_BUF_END:%.+]] = getelementptr float, ptr [[A_BUF_IDX]], i64 [[NUM_ELEMS]] // CHECK: [[ISEMPTY:%.+]] = icmp eq ptr [[A_BUF_IDX]], [[A_BUF_END]] // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]] @@ -289,13 +289,13 @@ void baz(int n) { // CHECK: [[IF_THEN]]: // CHECK: [[BASE_IDX_SUB_1:%.+]] = sub nuw i64 [[BASE_IDX]], 1 // CHECK: [[IDX:%.+]] = mul nsw i64 
[[BASE_IDX_SUB_1]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_PRIV]], ptr {{.*}}[[A_BUF_IDX]], i64 [[BYTES]], i1 false) // b_priv = b_buffer[i]; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX_SUB_1]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX_SUB_1]] // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, ptr [[B_BUF_IDX]], // CHECK: store double [[B_BUF_IDX_VAL]], ptr [[B_PRIV_ADDR]], // CHECK: br label %[[SCAN_PHASE]] diff --git a/clang/test/OpenMP/parallel_for_simd_scan_codegen.cpp b/clang/test/OpenMP/parallel_for_simd_scan_codegen.cpp index 7e973a602a65c..cac997753d480 100644 --- a/clang/test/OpenMP/parallel_for_simd_scan_codegen.cpp +++ b/clang/test/OpenMP/parallel_for_simd_scan_codegen.cpp @@ -51,13 +51,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS:%.+]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF:%.+]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF:%.+]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_BUF_IDX]], ptr {{.*}}[[A_PRIV]], i64 [[BYTES]], i1 false) // b_buffer[i] = b_priv; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF:%.+]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF:%.+]], i64 [[BASE_IDX]] // CHECK: [[B_PRIV:%.+]] = load double, ptr [[B_PRIV_ADDR]], // CHECK: store double [[B_PRIV]], ptr [[B_BUF_IDX]], // CHECK: br label %[[LOOP_CONTINUE:.+]] @@ -84,13 +84,13 @@ void baz(int n) { // a_buffer[i] += a_buffer[i-pow(2, k)]; // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[I]] + // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[I]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] - // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] + // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw double, ptr 
[[B_BUF]], i64 [[IDX_SUB_K2POW]] // CHECK: [[A_BUF_END:%.+]] = getelementptr float, ptr [[A_BUF_IDX]], i64 [[NUM_ELEMS]] // CHECK: [[ISEMPTY:%.+]] = icmp eq ptr [[A_BUF_IDX]], [[A_BUF_END]] // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]] @@ -144,13 +144,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_PRIV]], ptr {{.*}}[[A_BUF_IDX]], i64 [[BYTES]], i1 false) // b_priv = b_buffer[i]; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX]] // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, ptr [[B_BUF_IDX]], // CHECK: store double [[B_BUF_IDX_VAL]], ptr [[B_PRIV_ADDR]], // CHECK: br label %[[SCAN_PHASE:[^,]+]] @@ -181,13 +181,13 @@ void baz(int n) { // CHECK: [[BASE_IDX_I:%.+]] = load i32, ptr [[IV_ADDR:%.+]], // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS:%.+]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF:%.+]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF:%.+]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_BUF_IDX]], ptr {{.*}}[[A_PRIV]], i64 [[BYTES]], i1 false) // b_buffer[i] = b_priv; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF:%.+]], i64 [[BASE_IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF:%.+]], i64 [[BASE_IDX]] // CHECK: [[B_PRIV:%.+]] = load double, ptr [[B_PRIV_ADDR]], // CHECK: store double [[B_PRIV]], ptr [[B_BUF_IDX]], // CHECK: br label %[[LOOP_CONTINUE:[^,]+]] @@ -219,13 +219,13 @@ void baz(int n) { // a_buffer[i] += a_buffer[i-pow(2, k)]; // CHECK: [[IDX:%.+]] = mul nsw i64 [[I]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] // CHECK: [[IDX:%.+]] = mul nsw i64 [[IDX_SUB_K2POW]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[I]] + // CHECK: [[A_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[I]] // CHECK: [[IDX_SUB_K2POW:%.+]] = sub nuw i64 [[I]], [[K2POW]] - // 
CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] + // CHECK: [[B_BUF_IDX_SUB_K2POW:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[IDX_SUB_K2POW]] // CHECK: [[A_BUF_END:%.+]] = getelementptr float, ptr [[A_BUF_IDX]], i64 [[NUM_ELEMS]] // CHECK: [[ISEMPTY:%.+]] = icmp eq ptr [[A_BUF_IDX]], [[A_BUF_END]] // CHECK: br i1 [[ISEMPTY]], label %[[RED_DONE:[^,]+]], label %[[RED_BODY:[^,]+]] @@ -282,13 +282,13 @@ void baz(int n) { // CHECK: [[IF_THEN]]: // CHECK: [[BASE_IDX_SUB_1:%.+]] = sub nuw i64 [[BASE_IDX]], 1 // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX_SUB_1]], [[NUM_ELEMS]] - // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, ptr [[A_BUF]], i64 [[IDX]] - // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 + // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[A_BUF]], i64 [[IDX]] + // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds nuw [10 x float], ptr [[A_PRIV_ADDR:%.+]], i64 0, i64 0 // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}[[A_PRIV]], ptr {{.*}}[[A_BUF_IDX]], i64 [[BYTES]], i1 false) // b_priv = b_buffer[i]; - // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, ptr [[B_BUF]], i64 [[BASE_IDX_SUB_1]] + // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds nuw double, ptr [[B_BUF]], i64 [[BASE_IDX_SUB_1]] // CHECK: [[B_BUF_IDX_VAL:%.+]] = load double, ptr [[B_BUF_IDX]], // CHECK: store double [[B_BUF_IDX_VAL]], ptr [[B_PRIV_ADDR]], // CHECK: br label %[[SCAN_PHASE]] diff --git a/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp index b17757a5f978a..c1fe00f238001 100644 --- a/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp @@ -72,16 +72,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP4]] // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i64 9 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] @@ -111,7 +111,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP21:%.*]] = 
getelementptr i8, ptr [[VLA]], i64 [[TMP20]] // CHECK1-NEXT: store ptr [[_TMP5]], ptr [[TMP]], align 8 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[_TMP5]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -126,19 +126,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP27]], align 8 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 0 // CHECK1-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds ptr, ptr [[TMP30]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP30]], i64 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[ARRAYIDX7]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN9:%.*]] = add nsw i64 -1, [[TMP33]] // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr [[TMP34]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP34]], i64 9 // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[ARRAYIDX10]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[TMP36]], align 8 @@ -425,9 +425,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr 
inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/parallel_master_taskloop_reduction_codegen.cpp b/clang/test/OpenMP/parallel_master_taskloop_reduction_codegen.cpp index 3c74aaca3f46d..1d106922435d5 100644 --- a/clang/test/OpenMP/parallel_master_taskloop_reduction_codegen.cpp +++ b/clang/test/OpenMP/parallel_master_taskloop_reduction_codegen.cpp @@ -84,9 +84,9 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB1:.+]], ptr [[TMP25]], // CHECK-DAG: [[TMP26:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 8 [[TMP26]], i8 0, i64 4, i1 false) -// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C:%.+]], i64 0, i64 0 +// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C:%.+]], i64 0, i64 0 // CHECK-DAG: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, % -// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] +// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], // CHECK-DAG: [[TMP28]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_4:%.+]], i32 0, i32 0 // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], @@ -138,10 +138,10 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB4:.+]], ptr [[TMP59]], // CHECK-DAG: [[TMP60:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_8]], i32 0, i32 6 // CHECK-DAG: store i32 1, ptr [[TMP60]], -// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 // CHECK: [[TMP62:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0:%.+]], i32 4, ptr [[DOTRD_INPUT_]]) // CHECK: [[TMP63:%.*]] = load i32, ptr [[N:%.+]], // CHECK: store i32 [[TMP63]], ptr [[DOTCAPTURE_EXPR_]], diff --git a/clang/test/OpenMP/parallel_master_taskloop_simd_reduction_codegen.cpp b/clang/test/OpenMP/parallel_master_taskloop_simd_reduction_codegen.cpp index 7d4216ddde6a3..9a524c3b94c6e 100644 --- a/clang/test/OpenMP/parallel_master_taskloop_simd_reduction_codegen.cpp +++ 
b/clang/test/OpenMP/parallel_master_taskloop_simd_reduction_codegen.cpp @@ -84,9 +84,9 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB1:.+]], ptr [[TMP25]], // CHECK-DAG: [[TMP26:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 8 [[TMP26]], i8 0, i64 4, i1 false) -// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C:%.+]], i64 0, i64 0 +// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C:%.+]], i64 0, i64 0 // CHECK-DAG: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, % -// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] +// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], // CHECK-DAG: [[TMP28]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_4:%.+]], i32 0, i32 0 // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], @@ -138,10 +138,10 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB4:.+]], ptr [[TMP59]], // CHECK-DAG: [[TMP60:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_8]], i32 0, i32 6 // CHECK-DAG: store i32 1, ptr [[TMP60]], -// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 // CHECK: [[TMP62:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0:%.+]], i32 4, ptr [[DOTRD_INPUT_]]) // CHECK: [[TMP63:%.*]] = load i32, ptr [[N:%.+]], // CHECK: store i32 [[TMP63]], ptr [[DOTCAPTURE_EXPR_]], diff --git a/clang/test/OpenMP/parallel_reduction_codegen.cpp b/clang/test/OpenMP/parallel_reduction_codegen.cpp index f49faa6b89deb..ce76429b871fe 100644 --- a/clang/test/OpenMP/parallel_reduction_codegen.cpp +++ b/clang/test/OpenMP/parallel_reduction_codegen.cpp @@ -354,9 +354,9 @@ int main() { // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK1-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP0]], i64 0 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[X_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i64 
0 // CHECK1-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARRAYIDX1]] to i64 // CHECK1-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK1-NEXT: [[TMP4:%.*]] = sub i64 [[TMP2]], [[TMP3]] @@ -1632,9 +1632,9 @@ int main() { // CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK3-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8 // CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR]], align 8 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i64 0 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP0]], i64 0 // CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[X_ADDR]], align 8 -// CHECK3-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i64 0 +// CHECK3-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i64 0 // CHECK3-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARRAYIDX1]] to i64 // CHECK3-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK3-NEXT: [[TMP4:%.*]] = sub i64 [[TMP2]], [[TMP3]] @@ -2142,9 +2142,9 @@ int main() { // CHECK4-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK4-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8 // CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR]], align 8 -// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i64 0 +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP0]], i64 0 // CHECK4-NEXT: [[TMP1:%.*]] = load ptr, ptr [[X_ADDR]], align 8 -// CHECK4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i64 0 +// CHECK4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i64 0 // CHECK4-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARRAYIDX1]] to i64 // CHECK4-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK4-NEXT: [[TMP4:%.*]] = sub i64 [[TMP2]], [[TMP3]] diff --git a/clang/test/OpenMP/parallel_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_reduction_task_codegen.cpp index 208f7a41aa3db..40cc3103b1c0f 100644 --- a/clang/test/OpenMP/parallel_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/parallel_reduction_task_codegen.cpp @@ -72,16 +72,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP4]] // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i64 9 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 
// CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] @@ -111,7 +111,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP20]] // CHECK1-NEXT: store ptr [[_TMP5]], ptr [[TMP]], align 8 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[_TMP5]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -126,19 +126,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP27]], align 8 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 0 // CHECK1-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds ptr, ptr [[TMP30]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP30]], i64 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[ARRAYIDX7]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN9:%.*]] = add nsw i64 -1, [[TMP33]] // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr [[TMP34]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP34]], i64 9 // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[ARRAYIDX10]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[TMP36]], align 8 @@ -416,9 +416,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// 
CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp index 6d73652c3ea27..61597a074cf59 100644 --- a/clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp @@ -81,16 +81,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_SECTIONS_IL_]], align 4 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP4]] // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i64 9 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] @@ -120,7 +120,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP20]] // CHECK1-NEXT: store ptr [[_TMP5]], ptr [[TMP]], align 8 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[_TMP5]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -135,19 +135,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP27]], align 8 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void 
@llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 0 // CHECK1-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds ptr, ptr [[TMP30]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP30]], i64 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[ARRAYIDX7]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN9:%.*]] = add nsw i64 -1, [[TMP33]] // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr [[TMP34]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP34]], i64 9 // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[ARRAYIDX10]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[TMP36]], align 8 @@ -458,9 +458,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/reduction_implicit_map.cpp b/clang/test/OpenMP/reduction_implicit_map.cpp index 4d2b93ffd4712..a7db3da7d1f86 100644 --- a/clang/test/OpenMP/reduction_implicit_map.cpp +++ b/clang/test/OpenMP/reduction_implicit_map.cpp @@ -133,9 +133,9 @@ int main() // CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[E_ADDR]], align 8 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i64 0 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i64 0 // CHECK-NEXT: [[TMP1:%.*]] = load 
ptr, ptr [[E_ADDR]], align 8 -// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i64 0 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i64 0 // CHECK-NEXT: store double 0.000000e+00, ptr [[E2]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[E_ADDR]], align 8 // CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[TMP2]] to i64 @@ -529,16 +529,16 @@ int main() // CHECK1-NEXT: store i64 9, ptr [[DOTOMP_UB]], align 8 // CHECK1-NEXT: store i64 1, ptr [[DOTOMP_STRIDE]], align 8 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x double]]], ptr [[TMP0]], i64 0, i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [10 x [10 x [10 x double]]], ptr [[TMP0]], i64 0, i64 0 // CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [10 x [10 x double]], ptr [[ARRAYIDX]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x double], ptr [[ARRAYDECAY]], i64 2 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [10 x double], ptr [[ARRAYDECAY]], i64 2 // CHECK1-NEXT: [[ARRAYDECAY2:%.*]] = getelementptr inbounds [10 x double], ptr [[ARRAYIDX1]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY2]], i64 1 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [10 x [10 x [10 x double]]], ptr [[TMP0]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw double, ptr [[ARRAYDECAY2]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [10 x [10 x [10 x double]]], ptr [[TMP0]], i64 0, i64 1 // CHECK1-NEXT: [[ARRAYDECAY5:%.*]] = getelementptr inbounds [10 x [10 x double]], ptr [[ARRAYIDX4]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x double], ptr [[ARRAYDECAY5]], i64 5 +// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw [10 x double], ptr [[ARRAYDECAY5]], i64 5 // CHECK1-NEXT: [[ARRAYDECAY7:%.*]] = getelementptr inbounds [10 x double], ptr [[ARRAYIDX6]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY7]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw double, ptr [[ARRAYDECAY7]], i64 1 // CHECK1-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[ARRAYIDX8]] to i64 // CHECK1-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARRAYIDX3]] to i64 // CHECK1-NEXT: [[TMP3:%.*]] = sub i64 [[TMP1]], [[TMP2]] @@ -564,18 +564,18 @@ int main() // CHECK1-NEXT: [[TMP11:%.*]] = sub i64 [[TMP9]], [[TMP10]] // CHECK1-NEXT: [[TMP12:%.*]] = sdiv exact i64 [[TMP11]], ptrtoint (ptr getelementptr (double, ptr null, i32 1) to i64) // CHECK1-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[VLA]], i64 [[TMP12]] -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [1 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [1 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x [10 x double]]], ptr [[TMP0]], i64 0, i64 0 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw [10 x [10 x [10 x double]]], ptr [[TMP0]], i64 0, i64 0 // CHECK1-NEXT: [[ARRAYDECAY10:%.*]] = getelementptr inbounds [10 x [10 x double]], ptr [[ARRAYIDX9]], i64 
0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x double], ptr [[ARRAYDECAY10]], i64 2 +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw [10 x double], ptr [[ARRAYDECAY10]], i64 2 // CHECK1-NEXT: [[ARRAYDECAY12:%.*]] = getelementptr inbounds [10 x double], ptr [[ARRAYIDX11]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY12]], i64 1 -// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x [10 x double]]], ptr [[TMP0]], i64 0, i64 1 +// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw double, ptr [[ARRAYDECAY12]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw [10 x [10 x [10 x double]]], ptr [[TMP0]], i64 0, i64 1 // CHECK1-NEXT: [[ARRAYDECAY15:%.*]] = getelementptr inbounds [10 x [10 x double]], ptr [[ARRAYIDX14]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x double], ptr [[ARRAYDECAY15]], i64 5 +// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw [10 x double], ptr [[ARRAYDECAY15]], i64 5 // CHECK1-NEXT: [[ARRAYDECAY17:%.*]] = getelementptr inbounds [10 x double], ptr [[ARRAYIDX16]], i64 0, i64 0 -// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY17]], i64 1 +// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds nuw double, ptr [[ARRAYDECAY17]], i64 1 // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX13]], ptr [[TMP15]], align 8 @@ -852,7 +852,7 @@ int main() // CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 // CHECK2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[INPUT_ADDR]], align 4 // CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[INPUT_ADDR]], align 4 -// CHECK2-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +// CHECK2-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP7]], i32 0 // CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[SIZE_ADDR]], align 4 // CHECK2-NEXT: [[TMP9:%.*]] = mul nuw i32 [[TMP8]], 4 // CHECK2-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64 @@ -930,10 +930,10 @@ int main() // CHECK2-NEXT: [[TMP46:%.*]] = load ptr, ptr [[INPUT_ADDR]], align 4 // CHECK2-NEXT: [[TMP47:%.*]] = load ptr, ptr [[OUTPUT_ADDR]], align 4 // CHECK2-NEXT: [[TMP48:%.*]] = load ptr, ptr [[OUTPUT_ADDR]], align 4 -// CHECK2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[TMP48]], i32 0 +// CHECK2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP48]], i32 0 // CHECK2-NEXT: [[TMP49:%.*]] = load ptr, ptr [[INPUT_ADDR]], align 4 // CHECK2-NEXT: [[TMP50:%.*]] = load ptr, ptr [[INPUT_ADDR]], align 4 -// CHECK2-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[TMP50]], i32 0 +// CHECK2-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP50]], i32 0 // CHECK2-NEXT: [[TMP51:%.*]] = load i32, ptr [[SIZE_ADDR]], align 4 // CHECK2-NEXT: [[TMP52:%.*]] = mul nuw i32 [[TMP51]], 4 // CHECK2-NEXT: [[TMP53:%.*]] = sext i32 [[TMP52]] to i64 @@ -1007,7 +1007,7 @@ int main() // CHECK2-NEXT: [[TMP86:%.*]] = load i32, ptr [[SIZE_ADDR]], align 4 // CHECK2-NEXT: store i32 [[TMP86]], ptr [[SIZE_CASTED21]], align 4 // CHECK2-NEXT: [[TMP87:%.*]] = load i32, ptr [[SIZE_CASTED21]], align 4 -// CHECK2-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds [10 x i32], ptr [[A]], i32 0, i32 0 +// CHECK2-NEXT: 
[[ARRAYIDX22:%.*]] = getelementptr inbounds nuw [10 x i32], ptr [[A]], i32 0, i32 0 // CHECK2-NEXT: [[TMP88:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS23]], i32 0, i32 0 // CHECK2-NEXT: store i32 [[TMP87]], ptr [[TMP88]], align 4 // CHECK2-NEXT: [[TMP89:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS24]], i32 0, i32 0 @@ -1480,9 +1480,9 @@ int main() // CHECK2-NEXT: store ptr [[OUTPUT]], ptr [[OUTPUT_ADDR]], align 4 // CHECK2-NEXT: store ptr [[INPUT]], ptr [[INPUT_ADDR]], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[OUTPUT_ADDR]], align 4 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i32 0 // CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[OUTPUT_ADDR]], align 4 -// CHECK2-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 2 +// CHECK2-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 2 // CHECK2-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [3 x i32], ptr [[OUTPUT2]], i32 0, i32 0 // CHECK2-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i32 3 // CHECK2-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP2]] @@ -1664,9 +1664,9 @@ int main() // CHECK2-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[OUTPUT_ADDR]], align 4 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP6]], i32 0 // CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[OUTPUT_ADDR]], align 4 -// CHECK2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 2 +// CHECK2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP7]], i32 2 // CHECK2-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [3 x i32], ptr [[OUTPUT4]], i32 0, i32 0 // CHECK2-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i32 3 // CHECK2-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP8]] @@ -1878,8 +1878,8 @@ int main() // CHECK2-NEXT: store i32 [[SIZE]], ptr [[SIZE_ADDR]], align 4 // CHECK2-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4 -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i32 0, i32 0 -// CHECK2-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i32 0, i32 1 +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [10 x i32], ptr [[TMP0]], i32 0, i32 0 +// CHECK2-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [10 x i32], ptr [[TMP0]], i32 0, i32 1 // CHECK2-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x i32], ptr [[A2]], i32 0, i32 0 // CHECK2-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i32 2 // CHECK2-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]] diff --git a/clang/test/OpenMP/sections_reduction_task_codegen.cpp b/clang/test/OpenMP/sections_reduction_task_codegen.cpp index 1a2cf7aede321..5d749eeb81776 100644 --- a/clang/test/OpenMP/sections_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/sections_reduction_task_codegen.cpp @@ -82,16 +82,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_SECTIONS_IL_]], align 4 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load 
ptr, ptr [[TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP5:%.*]] = sext i32 [[TMP4]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP5]] // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP6]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP6]], i64 9 // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP10:%.*]] = sub i64 [[TMP8]], [[TMP9]] @@ -121,7 +121,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP21]] // CHECK1-NEXT: store ptr [[_TMP5]], ptr [[TMP]], align 8 // CHECK1-NEXT: store ptr [[TMP22]], ptr [[_TMP5]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP23]], align 8 // CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -136,19 +136,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP28]], align 8 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP29]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds ptr, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load ptr, ptr [[ARRAYIDX7]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[TMP32]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP32]], i64 0 // CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP34:%.*]] = sext i32 [[TMP33]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN9:%.*]] 
= add nsw i64 -1, [[TMP34]] // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr [[TMP35]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP35]], i64 9 // CHECK1-NEXT: [[TMP36:%.*]] = load ptr, ptr [[ARRAYIDX10]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[TMP36]], i64 [[LB_ADD_LEN9]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP36]], i64 [[LB_ADD_LEN9]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP30]], align 8 // CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[TMP37]], align 8 @@ -463,9 +463,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/target_data_use_device_addr_codegen.cpp b/clang/test/OpenMP/target_data_use_device_addr_codegen.cpp index d912f801a33db..19a523ee165c8 100644 --- a/clang/test/OpenMP/target_data_use_device_addr_codegen.cpp +++ b/clang/test/OpenMP/target_data_use_device_addr_codegen.cpp @@ -54,12 +54,12 @@ int main() { // CHECK: [[SIZES:%.+]] = alloca [6 x i64], // CHECK: [[VLA_ADDR:%.+]] = alloca float, i64 %{{.+}}, // CHECK: [[PTR:%.+]] = load ptr, ptr [[PTR_ADDR]], -// CHECK-NEXT: [[ARR_IDX:%.+]] = getelementptr inbounds float, ptr [[PTR]], i64 3 +// CHECK-NEXT: [[ARR_IDX:%.+]] = getelementptr inbounds nuw float, ptr [[PTR]], i64 3 // CHECK: [[P5:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8 // CHECK-NEXT: [[ARR_IDX1:%.+]] = getelementptr inbounds float, ptr [[P5]], i64 0 // CHECK: [[P7:%.+]] = load ptr, ptr [[REF_ADDR]], // CHECK-NEXT: [[REF:%.+]] = load ptr, ptr [[REF_ADDR]], -// CHECK-NEXT: [[ARR_IDX2:%.+]] = getelementptr inbounds [4 x float], ptr [[ARR_ADDR]], i64 0, i64 0 +// CHECK-NEXT: [[ARR_IDX2:%.+]] = getelementptr inbounds nuw [4 x float], ptr [[ARR_ADDR]], i64 0, i64 0 // CHECK: [[P10:%.+]] = mul nuw i64 {{.+}}, 4 // CHECK-NEXT: [[ARR_IDX5:%.+]] = getelementptr inbounds float, ptr [[VLA_ADDR]], i64 0 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[SIZES]], ptr align 8 [[SIZES1]], i64 48, i1 false) @@ -132,14 +132,14 @@ int main() { // CHECK: [[SIZES:%.+]] = alloca [6 x i64], // CHECK: [[A_ADDR:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[THIS:%.+]], i32 0, i32 0 // CHECK: [[PTR_ADDR:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[THIS]], i32 0, i32 1 -// CHECK: [[ARR_IDX:%.+]] = getelementptr inbounds i32, ptr %{{.+}}, i64 3 +// CHECK: [[ARR_IDX:%.+]] = getelementptr inbounds nuw i32, ptr %{{.+}}, i64 3 // CHECK: [[REF_REF:%.+]] = 
getelementptr inbounds nuw %struct.S, ptr [[THIS]], i32 0, i32 2 // CHECK: [[REF_PTR:%.+]] = load ptr, ptr [[REF_REF]], // CHECK-NEXT: [[P3:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[THIS]], i32 0, i32 1 // CHECK: [[ARR_IDX5:%.+]] = getelementptr inbounds i32, ptr {{.+}}, i64 0 // CHECK: [[ARR_ADDR:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[THIS]], i32 0, i32 3 -// CHECK: [[ARR_IDX6:%.+]] = getelementptr inbounds [4 x i32], ptr [[ARR_ADDR]], i64 0, i64 0 +// CHECK: [[ARR_IDX6:%.+]] = getelementptr inbounds nuw [4 x i32], ptr [[ARR_ADDR]], i64 0, i64 0 // CHECK: [[A_ADDR2:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[THIS]], i32 0, i32 0 // CHECK: [[P4:%.+]] = mul nuw i64 [[CONV:%.+]], 4 // CHECK: [[A_ADDR3:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[THIS]], i32 0, i32 0 @@ -147,7 +147,7 @@ int main() { // CHECK: [[L6:%.+]] = sext i32 [[L5]] to i64 // CHECK: [[LB_ADD_LEN:%lb_add_len]] = add nsw i64 -1, [[L6]] // CHECK: [[ARR_ADDR9:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[THIS]], i32 0, i32 3 -// CHECK: [[ARR_IDX10:%arrayidx.+]] = getelementptr inbounds [4 x i32], ptr [[ARR_ADDR9]], i64 0, i64 %lb_add_len +// CHECK: [[ARR_IDX10:%arrayidx.+]] = getelementptr inbounds nuw [4 x i32], ptr [[ARR_ADDR9]], i64 0, i64 %lb_add_len // CHECK: [[ARR_END:%.+]] = getelementptr i32, ptr [[ARR_IDX10]], i32 1 // CHECK: [[E:%.+]] = ptrtoint ptr [[ARR_END]] to i64 // CHECK: [[B:%.+]] = ptrtoint ptr [[A_ADDR]] to i64 diff --git a/clang/test/OpenMP/target_data_use_device_ptr_codegen.cpp b/clang/test/OpenMP/target_data_use_device_ptr_codegen.cpp index c90819dc2a22f..6d1c0213d648c 100644 --- a/clang/test/OpenMP/target_data_use_device_ptr_codegen.cpp +++ b/clang/test/OpenMP/target_data_use_device_ptr_codegen.cpp @@ -49,14 +49,14 @@ void foo(float *&lr, T *&tr) { // CK1-NOT: store ptr [[VAL]], ptr [[DECL]], // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[TT:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds double, ptr [[TT]], i32 1 + // CK1: getelementptr inbounds nuw double, ptr [[TT]], i32 1 #pragma omp target data map(g[:10]) use_device_ptr(g) { ++g; } // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE00]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds double, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw double, ptr [[TTT]], i32 1 ++g; // CK1: [[T1:%.+]] = load ptr, ptr [[DECL:%.+]], @@ -67,26 +67,26 @@ void foo(float *&lr, T *&tr) { // CK1-NOT: store ptr [[VAL]], ptr [[DECL]], // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds float, ptr [[TT1]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TT1]], i32 1 #pragma omp target data map(l[:10]) use_device_ptr(l) { ++l; } // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE01]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds float, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TTT]], i32 1 ++l; // CK1-NOT: call void @__tgt_target // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds float, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TTT]], i32 1 #pragma omp target data map(l[:10]) use_device_ptr(l) if(0) { ++l; } // CK1-NOT: call void @__tgt_target // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds float, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TTT]], i32 1 ++l; // CK1: [[T1:%.+]] = load ptr, ptr [[DECL:%.+]], @@ -97,14 
+97,14 @@ void foo(float *&lr, T *&tr) { // CK1-NOT: store ptr [[VAL]], ptr [[DECL]], // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds float, ptr [[TT1]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TT1]], i32 1 #pragma omp target data map(l[:10]) use_device_ptr(l) if(1) { ++l; } // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE03]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds float, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TTT]], i32 1 ++l; // CK1: [[CMP:%.+]] = icmp ne ptr %{{.+}}, null @@ -119,12 +119,12 @@ void foo(float *&lr, T *&tr) { // CK1-NOT: store ptr [[VAL]], ptr [[DECL]], // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds float, ptr [[TT1]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TT1]], i32 1 // CK1: br label %[[BEND:.+]] // CK1: [[BELSE]]: // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds float, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TTT]], i32 1 // CK1: br label %[[BEND]] #pragma omp target data map(l[:10]) use_device_ptr(l) if(lr != 0) { @@ -142,7 +142,7 @@ void foo(float *&lr, T *&tr) { // CK1: [[BEND]]: // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds float, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TTT]], i32 1 ++l; // CK1: [[T2:%.+]] = load ptr, ptr [[DECL:%.+]], @@ -156,7 +156,7 @@ void foo(float *&lr, T *&tr) { // CK1: store ptr [[PVTV]], ptr [[PVT:%.+]], // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], // CK1: [[TT2:%.+]] = load ptr, ptr [[TT1]], - // CK1: getelementptr inbounds float, ptr [[TT2]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TT2]], i32 1 #pragma omp target data map(lr[:10]) use_device_ptr(lr) { ++lr; @@ -164,7 +164,7 @@ void foo(float *&lr, T *&tr) { // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE05]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], // CK1: [[TTTT:%.+]] = load ptr, ptr [[TTT]], - // CK1: getelementptr inbounds float, ptr [[TTTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TTTT]], i32 1 ++lr; // CK1: [[T1:%.+]] = load ptr, ptr [[DECL:%.+]], @@ -175,14 +175,14 @@ void foo(float *&lr, T *&tr) { // CK1-NOT: store ptr [[VAL]], ptr [[DECL]], // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds i32, ptr [[TT1]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TT1]], i32 1 #pragma omp target data map(t[:10]) use_device_ptr(t) { ++t; } // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE06]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds i32, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TTT]], i32 1 ++t; // CK1: [[T2:%.+]] = load ptr, ptr [[DECL:%.+]], @@ -196,7 +196,7 @@ void foo(float *&lr, T *&tr) { // CK1: store ptr [[PVTV]], ptr [[PVT:%.+]], // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], // CK1: [[TT2:%.+]] = load ptr, ptr [[TT1]], - // CK1: getelementptr inbounds i32, ptr [[TT2]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TT2]], i32 1 #pragma omp target data map(tr[:10]) use_device_ptr(tr) { ++tr; @@ -204,7 +204,7 @@ void foo(float *&lr, T *&tr) { // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE07]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], // CK1: [[TTTT:%.+]] = load ptr, ptr [[TTT]], - // CK1: getelementptr inbounds 
i32, ptr [[TTTT]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TTTT]], i32 1 ++tr; // CK1: [[T1:%.+]] = load ptr, ptr [[DECL:%.+]], @@ -215,14 +215,14 @@ void foo(float *&lr, T *&tr) { // CK1-NOT: store ptr [[VAL]], ptr [[DECL]], // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds float, ptr [[TT1]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TT1]], i32 1 #pragma omp target data map(l[:10], t[:10]) use_device_ptr(l) { ++l; ++t; } // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE08]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds float, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[TTT]], i32 1 ++l; ++t; @@ -232,18 +232,18 @@ void foo(float *&lr, T *&tr) { // CK1: [[VAL:%.+]] = load ptr, ptr {{%.+}}, // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[_TT1:%.+]] = load ptr, ptr [[_PVT]], - // CK1: getelementptr inbounds float, ptr [[_TT1]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[_TT1]], i32 1 // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds i32, ptr [[TT1]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TT1]], i32 1 #pragma omp target data map(l[:10], t[:10]) use_device_ptr(l) use_device_ptr(t) { ++l; ++t; } // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE09]] // CK1: [[_TTT:%.+]] = load ptr, ptr {{%.+}}, - // CK1: getelementptr inbounds float, ptr [[_TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[_TTT]], i32 1 // CK1: [[TTT:%.+]] = load ptr, ptr {{%.+}}, - // CK1: getelementptr inbounds i32, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TTT]], i32 1 ++l; ++t; // CK1: call void @__tgt_target_data_begin{{.+}}[[MTYPE10]] @@ -252,18 +252,18 @@ void foo(float *&lr, T *&tr) { // CK1: [[VAL:%.+]] = load ptr, ptr {{%.+}}, // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[_TT1:%.+]] = load ptr, ptr [[_PVT]], - // CK1: getelementptr inbounds float, ptr [[_TT1]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[_TT1]], i32 1 // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds i32, ptr [[TT1]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TT1]], i32 1 #pragma omp target data map(l[:10], t[:10]) use_device_ptr(l,t) { ++l; ++t; } // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE10]] // CK1: [[_TTT:%.+]] = load ptr, ptr {{%.+}}, - // CK1: getelementptr inbounds float, ptr [[_TTT]], i32 1 + // CK1: getelementptr inbounds nuw float, ptr [[_TTT]], i32 1 // CK1: [[TTT:%.+]] = load ptr, ptr {{%.+}}, - // CK1: getelementptr inbounds i32, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TTT]], i32 1 ++l; ++t; // CK1: [[T1:%.+]] = load ptr, ptr [[DECL:%.+]], @@ -274,14 +274,14 @@ void foo(float *&lr, T *&tr) { // CK1-NOT: store ptr [[VAL]], ptr [[DECL]], // CK1: store ptr [[VAL]], ptr [[PVT:%.+]], // CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], - // CK1: getelementptr inbounds i32, ptr [[TT1]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TT1]], i32 1 #pragma omp target data map(l[:10]) use_device_ptr(t) { ++l; ++t; } // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE11]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK1: getelementptr inbounds i32, ptr [[TTT]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TTT]], i32 1 ++l; ++t; // CK1: [[T2:%.+]] = load ptr, ptr [[DECL:%.+]], @@ -295,7 +295,7 @@ void foo(float *&lr, T *&tr) { // CK1: store ptr [[PVTV]], ptr [[PVT:%.+]], // 
CK1: [[TT1:%.+]] = load ptr, ptr [[PVT]], // CK1: [[TT2:%.+]] = load ptr, ptr [[TT1]], - // CK1: getelementptr inbounds i32, ptr [[TT2]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TT2]], i32 1 #pragma omp target data map(l[:10]) use_device_ptr(tr) { ++l; ++tr; @@ -303,7 +303,7 @@ void foo(float *&lr, T *&tr) { // CK1: call void @__tgt_target_data_end{{.+}}[[MTYPE12]] // CK1: [[TTT:%.+]] = load ptr, ptr [[DECL]], // CK1: [[TTTT:%.+]] = load ptr, ptr [[TTT]], - // CK1: getelementptr inbounds i32, ptr [[TTTT]], i32 1 + // CK1: getelementptr inbounds nuw i32, ptr [[TTTT]], i32 1 ++l; ++tr; } @@ -354,7 +354,7 @@ struct ST { // CK2: store ptr [[PVT]], ptr [[PVT2:%.+]], // CK2: [[TT1:%.+]] = load ptr, ptr [[PVT2]], // CK2: [[TT2:%.+]] = load ptr, ptr [[TT1]], - // CK2: getelementptr inbounds double, ptr [[TT2]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[TT2]], i32 1 #pragma omp target data map(a[:10]) use_device_ptr(a) { a++; @@ -362,7 +362,7 @@ struct ST { // CK2: call void @__tgt_target_data_end{{.+}}[[MTYPE00]] // CK2: [[DECL:%.+]] = getelementptr inbounds nuw [[ST]], ptr %this1, i32 0, i32 0 // CK2: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK2: getelementptr inbounds double, ptr [[TTT]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[TTT]], i32 1 a++; // CK2: [[BP:%.+]] = getelementptr inbounds [2 x ptr], ptr %{{.+}}, i32 0, i32 1 @@ -373,7 +373,7 @@ struct ST { // CK2: store ptr [[PVT]], ptr [[PVT2:%.+]], // CK2: [[TT1:%.+]] = load ptr, ptr [[PVT2]], // CK2: [[TT2:%.+]] = load ptr, ptr [[TT1]], - // CK2: getelementptr inbounds double, ptr [[TT2]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[TT2]], i32 1 #pragma omp target data map(b[:10]) use_device_ptr(b) { b++; @@ -382,7 +382,7 @@ struct ST { // CK2: [[DECL:%.+]] = getelementptr inbounds nuw [[ST]], ptr %{{.+}}, i32 0, i32 1 // CK2: [[TTT:%.+]] = load ptr, ptr [[DECL]], // CK2: [[TTTT:%.+]] = load ptr, ptr [[TTT]], - // CK2: getelementptr inbounds double, ptr [[TTTT]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[TTTT]], i32 1 b++; // CK2: [[BP:%.+]] = getelementptr inbounds [3 x ptr], ptr %{{.+}}, i32 0, i32 2 @@ -393,7 +393,7 @@ struct ST { // CK2: store ptr [[PVT]], ptr [[PVT2:%.+]], // CK2: [[TT1:%.+]] = load ptr, ptr [[PVT2]], // CK2: [[TT2:%.+]] = load ptr, ptr [[TT1]], - // CK2: getelementptr inbounds double, ptr [[TT2]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[TT2]], i32 1 #pragma omp target data map(la[:10]) use_device_ptr(a) { a++; @@ -402,7 +402,7 @@ struct ST { // CK2: call void @__tgt_target_data_end{{.+}}[[MTYPE02]] // CK2: [[DECL:%.+]] = getelementptr inbounds nuw [[ST]], ptr %this1, i32 0, i32 0 // CK2: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK2: getelementptr inbounds double, ptr [[TTT]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[TTT]], i32 1 a++; la++; @@ -419,10 +419,10 @@ struct ST { // CK2: store ptr [[PVT1]], ptr [[_PVT1:%.+]], // CK2: [[TT2:%.+]] = load ptr, ptr [[_PVT2]], // CK2: [[_TT2:%.+]] = load ptr, ptr [[TT2]], - // CK2: getelementptr inbounds double, ptr [[_TT2]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[_TT2]], i32 1 // CK2: [[TT1:%.+]] = load ptr, ptr [[_PVT1]], // CK2: [[_TT1:%.+]] = load ptr, ptr [[TT1]], - // CK2: getelementptr inbounds double, ptr [[_TT1]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[_TT1]], i32 1 #pragma omp target data map(b[:10]) use_device_ptr(a, b) { a++; @@ -431,11 +431,11 @@ struct ST { // CK2: call void @__tgt_target_data_end{{.+}}[[MTYPE03]] // CK2: 
[[DECL:%.+]] = getelementptr inbounds nuw [[ST]], ptr %this1, i32 0, i32 0 // CK2: [[TTT:%.+]] = load ptr, ptr [[DECL]], - // CK2: getelementptr inbounds double, ptr [[TTT]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[TTT]], i32 1 // CK2: [[_DECL:%.+]] = getelementptr inbounds nuw [[ST]], ptr %this1, i32 0, i32 1 // CK2: [[_TTT:%.+]] = load ptr, ptr [[_DECL]], // CK2: [[_TTTT:%.+]] = load ptr, ptr [[_TTT]], - // CK2: getelementptr inbounds double, ptr [[_TTTT]], i32 1 + // CK2: getelementptr inbounds nuw double, ptr [[_TTTT]], i32 1 a++; b++; } diff --git a/clang/test/OpenMP/target_has_device_addr_codegen.cpp b/clang/test/OpenMP/target_has_device_addr_codegen.cpp index 08bcc87ca5f0a..39eaedb0e48d1 100644 --- a/clang/test/OpenMP/target_has_device_addr_codegen.cpp +++ b/clang/test/OpenMP/target_has_device_addr_codegen.cpp @@ -586,7 +586,7 @@ void use_template() { // CHECK-NEXT: store ptr [[K]], ptr [[K_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[K_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CHECK-NEXT: ret void // @@ -601,7 +601,7 @@ void use_template() { // CHECK-NEXT: store ptr [[TMP0]], ptr [[TMP]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 8 // CHECK-NEXT: ret void // @@ -1079,7 +1079,7 @@ void use_template() { // CHECK-NEXT: store ptr [[K]], ptr [[K_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[K_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CHECK-NEXT: ret void // @@ -1094,7 +1094,7 @@ void use_template() { // CHECK-NEXT: store ptr [[TMP0]], ptr [[TMP]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 8 // CHECK-NEXT: ret void // @@ -1133,7 +1133,7 @@ void use_template() { // CHECK-NEXT: store ptr [[K]], ptr [[K_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[K_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CHECK-NEXT: ret void // @@ -1148,7 +1148,7 @@ void use_template() { // CHECK-NEXT: store ptr [[TMP0]], ptr [[TMP]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CHECK-NEXT: [[INCDEC_PTR:%.*]] = 
getelementptr inbounds ptr, ptr [[TMP2]], i32 1 +// CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP2]], i32 1 // CHECK-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 8 // CHECK-NEXT: ret void // @@ -1422,14 +1422,14 @@ void use_template() { // SIMD-ONLY0-NEXT: store ptr [[K]], ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: store ptr [[AA]], ptr [[RAA]], align 8 // SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load ptr, ptr [[K]], align 8 -// SIMD-ONLY0-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// SIMD-ONLY0-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // SIMD-ONLY0-NEXT: store ptr [[INCDEC_PTR]], ptr [[K]], align 8 // SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load ptr, ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // SIMD-ONLY0-NEXT: [[TMP3:%.*]] = load ptr, ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8 // SIMD-ONLY0-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 -// SIMD-ONLY0-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 +// SIMD-ONLY0-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i32 1 // SIMD-ONLY0-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP4]], align 8 // SIMD-ONLY0-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[AA]], i64 0, i64 0 // SIMD-ONLY0-NEXT: store i32 1, ptr [[ARRAYIDX]], align 4 @@ -1478,14 +1478,14 @@ void use_template() { // SIMD-ONLY0-NEXT: store ptr [[TMP0]], ptr [[K]], align 8 // SIMD-ONLY0-NEXT: store ptr [[K]], ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load ptr, ptr [[K]], align 8 -// SIMD-ONLY0-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// SIMD-ONLY0-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // SIMD-ONLY0-NEXT: store ptr [[INCDEC_PTR]], ptr [[K]], align 8 // SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load ptr, ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // SIMD-ONLY0-NEXT: [[TMP3:%.*]] = load ptr, ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8 // SIMD-ONLY0-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 -// SIMD-ONLY0-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 +// SIMD-ONLY0-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i32 1 // SIMD-ONLY0-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP4]], align 8 // SIMD-ONLY0-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[AA]], i64 0, i64 0 // SIMD-ONLY0-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 @@ -1520,14 +1520,14 @@ void use_template() { // SIMD-ONLY0-NEXT: store ptr [[TMP0]], ptr [[K]], align 8 // SIMD-ONLY0-NEXT: store ptr [[K]], ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load ptr, ptr [[K]], align 8 -// SIMD-ONLY0-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i32 1 +// SIMD-ONLY0-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i32 1 // SIMD-ONLY0-NEXT: store ptr [[INCDEC_PTR]], ptr [[K]], align 8 // SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load ptr, ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // SIMD-ONLY0-NEXT: [[TMP3:%.*]] = load ptr, ptr [[Z]], align 8 // SIMD-ONLY0-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8 // SIMD-ONLY0-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 -// SIMD-ONLY0-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i32 1 +// 
SIMD-ONLY0-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i32 1 // SIMD-ONLY0-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP4]], align 8 // SIMD-ONLY0-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x ptr], ptr [[AA]], i64 0, i64 0 // SIMD-ONLY0-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 diff --git a/clang/test/OpenMP/target_in_reduction_codegen.cpp b/clang/test/OpenMP/target_in_reduction_codegen.cpp index fb715e2de2a59..56191ee575136 100644 --- a/clang/test/OpenMP/target_in_reduction_codegen.cpp +++ b/clang/test/OpenMP/target_in_reduction_codegen.cpp @@ -70,7 +70,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP2]], align 16 // CHECK1-NEXT: store i64 [[TMP2]], ptr [[__VLA_EXPR0]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[A]], ptr [[TMP4]], align 8 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -85,7 +85,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP11]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP12]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[B]], ptr [[TMP14]], align 8 // CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 1 @@ -100,7 +100,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..2, ptr [[TMP21]], align 8 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP22]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 // CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC_ADDR]], ptr [[TMP24]], align 8 // CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 1 @@ -118,7 +118,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP35:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 3, ptr [[DOTRD_INPUT_]]) // CHECK1-NEXT: store ptr [[TMP35]], ptr [[DOTTASK_RED_]], align 
8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0:%.*]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[C]], ptr [[TMP36]], align 8 // CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 1 @@ -133,7 +133,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..6, ptr [[TMP43]], align 8 // CHECK1-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP44]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP46]], align 8 // CHECK1-NEXT: [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 1 diff --git a/clang/test/OpenMP/target_is_device_ptr_codegen.cpp b/clang/test/OpenMP/target_is_device_ptr_codegen.cpp index 3a1c168533c37..505c34e21733c 100644 --- a/clang/test/OpenMP/target_is_device_ptr_codegen.cpp +++ b/clang/test/OpenMP/target_is_device_ptr_codegen.cpp @@ -2142,7 +2142,7 @@ void bar() { // CK10-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 // CK10-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 // CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 8 -// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[G_ADDR]], align 8 // CK10-NEXT: ret void // @@ -2153,7 +2153,7 @@ void bar() { // CK10-NEXT: [[L_ADDR:%.*]] = alloca ptr, align 8 // CK10-NEXT: store ptr [[L]], ptr [[L_ADDR]], align 8 // CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[L_ADDR]], align 8 -// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 1 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw float, ptr [[TMP0]], i32 1 // CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[L_ADDR]], align 8 // CK10-NEXT: ret void // @@ -2164,7 +2164,7 @@ void bar() { // CK10-NEXT: [[T_ADDR:%.*]] = alloca ptr, align 8 // CK10-NEXT: store ptr [[T]], ptr [[T_ADDR]], align 8 // CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_ADDR]], align 8 -// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 1 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i32 1 // CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[T_ADDR]], align 8 // CK10-NEXT: ret void // @@ -2178,7 +2178,7 @@ void bar() { // CK10-NEXT: store ptr [[LR_ADDR]], ptr [[TMP]], align 8 // CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 // CK10-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CK10-NEXT: 
[[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 1 // CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CK10-NEXT: ret void // @@ -2192,7 +2192,7 @@ void bar() { // CK10-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 8 // CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 // CK10-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CK10-NEXT: ret void // @@ -2206,7 +2206,7 @@ void bar() { // CK10-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 8 // CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 // CK10-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CK10-NEXT: ret void // @@ -2224,11 +2224,11 @@ void bar() { // CK10-NEXT: store ptr [[LR_ADDR]], ptr [[_TMP1]], align 8 // CK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 // CK10-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK10-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CK10-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 8 // CK10-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 -// CK10-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 1 +// CK10-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 1 // CK10-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP2]], align 8 // CK10-NEXT: ret void // @@ -2613,7 +2613,7 @@ void bar() { // CK11-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 // CK11-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 // CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 8 -// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[G_ADDR]], align 8 // CK11-NEXT: ret void // @@ -2624,7 +2624,7 @@ void bar() { // CK11-NEXT: [[L_ADDR:%.*]] = alloca ptr, align 8 // CK11-NEXT: store ptr [[L]], ptr [[L_ADDR]], align 8 // CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[L_ADDR]], align 8 -// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 1 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw float, ptr [[TMP0]], i32 1 // CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[L_ADDR]], align 8 // CK11-NEXT: ret void // @@ -2635,7 +2635,7 @@ void bar() { // CK11-NEXT: [[T_ADDR:%.*]] = alloca ptr, align 8 // CK11-NEXT: store ptr [[T]], ptr [[T_ADDR]], align 8 // CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_ADDR]], align 8 -// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 1 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i32 1 // CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[T_ADDR]], align 8 // CK11-NEXT: ret void // @@ -2649,7 +2649,7 @@ void bar() { // CK11-NEXT: store ptr [[LR_ADDR]], ptr [[TMP]], align 8 // CK11-NEXT: 
[[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 // CK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 1 // CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CK11-NEXT: ret void // @@ -2663,7 +2663,7 @@ void bar() { // CK11-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 8 // CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 // CK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CK11-NEXT: ret void // @@ -2677,7 +2677,7 @@ void bar() { // CK11-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 8 // CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 // CK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CK11-NEXT: ret void // @@ -2695,11 +2695,11 @@ void bar() { // CK11-NEXT: store ptr [[LR_ADDR]], ptr [[_TMP1]], align 8 // CK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 8 // CK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 -// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK11-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 8 // CK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 8 // CK11-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 -// CK11-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 1 +// CK11-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 1 // CK11-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP2]], align 8 // CK11-NEXT: ret void // @@ -3084,7 +3084,7 @@ void bar() { // CK12-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 4 // CK12-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 4 // CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 4 -// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[G_ADDR]], align 4 // CK12-NEXT: ret void // @@ -3095,7 +3095,7 @@ void bar() { // CK12-NEXT: [[L_ADDR:%.*]] = alloca ptr, align 4 // CK12-NEXT: store ptr [[L]], ptr [[L_ADDR]], align 4 // CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[L_ADDR]], align 4 -// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 1 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw float, ptr [[TMP0]], i32 1 // CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[L_ADDR]], align 4 // CK12-NEXT: ret void // @@ -3106,7 +3106,7 @@ void bar() { // CK12-NEXT: [[T_ADDR:%.*]] = alloca ptr, align 4 // CK12-NEXT: store ptr [[T]], ptr [[T_ADDR]], align 4 // CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_ADDR]], align 4 -// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 1 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i32 1 // CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[T_ADDR]], align 4 // 
CK12-NEXT: ret void // @@ -3120,7 +3120,7 @@ void bar() { // CK12-NEXT: store ptr [[LR_ADDR]], ptr [[TMP]], align 4 // CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 // CK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 1 // CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 // CK12-NEXT: ret void // @@ -3134,7 +3134,7 @@ void bar() { // CK12-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 4 // CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 // CK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 // CK12-NEXT: ret void // @@ -3148,7 +3148,7 @@ void bar() { // CK12-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 4 // CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 // CK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 // CK12-NEXT: ret void // @@ -3166,11 +3166,11 @@ void bar() { // CK12-NEXT: store ptr [[LR_ADDR]], ptr [[_TMP1]], align 4 // CK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 // CK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK12-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 // CK12-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 4 // CK12-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 4 -// CK12-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 1 +// CK12-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 1 // CK12-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP2]], align 4 // CK12-NEXT: ret void // @@ -3555,7 +3555,7 @@ void bar() { // CK13-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 4 // CK13-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 4 // CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[G_ADDR]], align 4 -// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[G_ADDR]], align 4 // CK13-NEXT: ret void // @@ -3566,7 +3566,7 @@ void bar() { // CK13-NEXT: [[L_ADDR:%.*]] = alloca ptr, align 4 // CK13-NEXT: store ptr [[L]], ptr [[L_ADDR]], align 4 // CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[L_ADDR]], align 4 -// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 1 +// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw float, ptr [[TMP0]], i32 1 // CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[L_ADDR]], align 4 // CK13-NEXT: ret void // @@ -3577,7 +3577,7 @@ void bar() { // CK13-NEXT: [[T_ADDR:%.*]] = alloca ptr, align 4 // CK13-NEXT: store ptr [[T]], ptr [[T_ADDR]], align 4 // CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_ADDR]], align 4 -// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 1 +// CK13-NEXT: [[INCDEC_PTR:%.*]] 
= getelementptr inbounds nuw i32, ptr [[TMP0]], i32 1 // CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[T_ADDR]], align 4 // CK13-NEXT: ret void // @@ -3591,7 +3591,7 @@ void bar() { // CK13-NEXT: store ptr [[LR_ADDR]], ptr [[TMP]], align 4 // CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 // CK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 1 // CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 // CK13-NEXT: ret void // @@ -3605,7 +3605,7 @@ void bar() { // CK13-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 4 // CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 // CK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 // CK13-NEXT: ret void // @@ -3619,7 +3619,7 @@ void bar() { // CK13-NEXT: store ptr [[TR_ADDR]], ptr [[TMP]], align 4 // CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 // CK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 // CK13-NEXT: ret void // @@ -3637,11 +3637,11 @@ void bar() { // CK13-NEXT: store ptr [[LR_ADDR]], ptr [[_TMP1]], align 4 // CK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[TMP]], align 4 // CK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +// CK13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 1 // CK13-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP0]], align 4 // CK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP1]], align 4 // CK13-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 4 -// CK13-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 1 +// CK13-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 1 // CK13-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP2]], align 4 // CK13-NEXT: ret void // @@ -3674,34 +3674,34 @@ void bar() { // SIMD-ONLY00-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 8 // SIMD-ONLY00-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 8 // SIMD-ONLY00-NEXT: [[TMP0:%.*]] = load ptr, ptr @g, align 8 -// SIMD-ONLY00-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR]], ptr @g, align 8 // SIMD-ONLY00-NEXT: [[TMP1:%.*]] = load ptr, ptr [[L]], align 8 -// SIMD-ONLY00-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 1 // SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR1]], ptr [[L]], align 8 // SIMD-ONLY00-NEXT: [[TMP2:%.*]] = load ptr, ptr [[T]], align 8 -// SIMD-ONLY00-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i32 1 // SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR2]], ptr [[T]], align 8 // 
SIMD-ONLY00-NEXT: [[TMP3:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 // SIMD-ONLY00-NEXT: store ptr [[TMP3]], ptr [[TMP]], align 8 // SIMD-ONLY00-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 // SIMD-ONLY00-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8 // SIMD-ONLY00-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8 -// SIMD-ONLY00-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 1 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 1 // SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR3]], ptr [[TMP5]], align 8 // SIMD-ONLY00-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY00-NEXT: store ptr [[TMP7]], ptr [[_TMP4]], align 8 // SIMD-ONLY00-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY00-NEXT: [[TMP9:%.*]] = load ptr, ptr [[_TMP4]], align 8 // SIMD-ONLY00-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8 -// SIMD-ONLY00-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP10]], i32 1 // SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR5]], ptr [[TMP9]], align 8 // SIMD-ONLY00-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY00-NEXT: store ptr [[TMP11]], ptr [[_TMP6]], align 8 // SIMD-ONLY00-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY00-NEXT: [[TMP13:%.*]] = load ptr, ptr [[_TMP6]], align 8 // SIMD-ONLY00-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 8 -// SIMD-ONLY00-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 1 // SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR7]], ptr [[TMP13]], align 8 // SIMD-ONLY00-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY00-NEXT: store ptr [[TMP15]], ptr [[_TMP8]], align 8 @@ -3711,11 +3711,11 @@ void bar() { // SIMD-ONLY00-NEXT: [[TMP18:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 // SIMD-ONLY00-NEXT: [[TMP19:%.*]] = load ptr, ptr [[_TMP8]], align 8 // SIMD-ONLY00-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8 -// SIMD-ONLY00-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP20]], i32 1 // SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR10]], ptr [[TMP19]], align 8 // SIMD-ONLY00-NEXT: [[TMP21:%.*]] = load ptr, ptr [[_TMP9]], align 8 // SIMD-ONLY00-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP21]], align 8 -// SIMD-ONLY00-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 1 +// SIMD-ONLY00-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP22]], i32 1 // SIMD-ONLY00-NEXT: store ptr [[INCDEC_PTR11]], ptr [[TMP21]], align 8 // SIMD-ONLY00-NEXT: ret void // @@ -3748,34 +3748,34 @@ void bar() { // SIMD-ONLY01-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 8 // SIMD-ONLY01-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 8 // SIMD-ONLY01-NEXT: [[TMP0:%.*]] = load ptr, ptr @g, align 8 -// SIMD-ONLY01-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR]], ptr @g, align 8 // SIMD-ONLY01-NEXT: [[TMP1:%.*]] = load ptr, ptr [[L]], align 8 -// SIMD-ONLY01-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// 
SIMD-ONLY01-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 1 // SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR1]], ptr [[L]], align 8 // SIMD-ONLY01-NEXT: [[TMP2:%.*]] = load ptr, ptr [[T]], align 8 -// SIMD-ONLY01-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i32 1 // SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR2]], ptr [[T]], align 8 // SIMD-ONLY01-NEXT: [[TMP3:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 // SIMD-ONLY01-NEXT: store ptr [[TMP3]], ptr [[TMP]], align 8 // SIMD-ONLY01-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 // SIMD-ONLY01-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8 // SIMD-ONLY01-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8 -// SIMD-ONLY01-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 1 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 1 // SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR3]], ptr [[TMP5]], align 8 // SIMD-ONLY01-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY01-NEXT: store ptr [[TMP7]], ptr [[_TMP4]], align 8 // SIMD-ONLY01-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY01-NEXT: [[TMP9:%.*]] = load ptr, ptr [[_TMP4]], align 8 // SIMD-ONLY01-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8 -// SIMD-ONLY01-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP10]], i32 1 // SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR5]], ptr [[TMP9]], align 8 // SIMD-ONLY01-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY01-NEXT: store ptr [[TMP11]], ptr [[_TMP6]], align 8 // SIMD-ONLY01-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY01-NEXT: [[TMP13:%.*]] = load ptr, ptr [[_TMP6]], align 8 // SIMD-ONLY01-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 8 -// SIMD-ONLY01-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 1 // SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR7]], ptr [[TMP13]], align 8 // SIMD-ONLY01-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TR_ADDR]], align 8 // SIMD-ONLY01-NEXT: store ptr [[TMP15]], ptr [[_TMP8]], align 8 @@ -3785,11 +3785,11 @@ void bar() { // SIMD-ONLY01-NEXT: [[TMP18:%.*]] = load ptr, ptr [[LR_ADDR]], align 8 // SIMD-ONLY01-NEXT: [[TMP19:%.*]] = load ptr, ptr [[_TMP8]], align 8 // SIMD-ONLY01-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8 -// SIMD-ONLY01-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP20]], i32 1 // SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR10]], ptr [[TMP19]], align 8 // SIMD-ONLY01-NEXT: [[TMP21:%.*]] = load ptr, ptr [[_TMP9]], align 8 // SIMD-ONLY01-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP21]], align 8 -// SIMD-ONLY01-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 1 +// SIMD-ONLY01-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP22]], i32 1 // SIMD-ONLY01-NEXT: store ptr [[INCDEC_PTR11]], ptr [[TMP21]], align 8 // SIMD-ONLY01-NEXT: ret void // @@ -3822,34 +3822,34 @@ void bar() { // SIMD-ONLY02-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 4 // SIMD-ONLY02-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 4 // 
SIMD-ONLY02-NEXT: [[TMP0:%.*]] = load ptr, ptr @g, align 4 -// SIMD-ONLY02-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR]], ptr @g, align 4 // SIMD-ONLY02-NEXT: [[TMP1:%.*]] = load ptr, ptr [[L]], align 4 -// SIMD-ONLY02-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 1 // SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR1]], ptr [[L]], align 4 // SIMD-ONLY02-NEXT: [[TMP2:%.*]] = load ptr, ptr [[T]], align 4 -// SIMD-ONLY02-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i32 1 // SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR2]], ptr [[T]], align 4 // SIMD-ONLY02-NEXT: [[TMP3:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 // SIMD-ONLY02-NEXT: store ptr [[TMP3]], ptr [[TMP]], align 4 // SIMD-ONLY02-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 // SIMD-ONLY02-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4 // SIMD-ONLY02-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 4 -// SIMD-ONLY02-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 1 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 1 // SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR3]], ptr [[TMP5]], align 4 // SIMD-ONLY02-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY02-NEXT: store ptr [[TMP7]], ptr [[_TMP4]], align 4 // SIMD-ONLY02-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY02-NEXT: [[TMP9:%.*]] = load ptr, ptr [[_TMP4]], align 4 // SIMD-ONLY02-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 4 -// SIMD-ONLY02-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP10]], i32 1 // SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR5]], ptr [[TMP9]], align 4 // SIMD-ONLY02-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY02-NEXT: store ptr [[TMP11]], ptr [[_TMP6]], align 4 // SIMD-ONLY02-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY02-NEXT: [[TMP13:%.*]] = load ptr, ptr [[_TMP6]], align 4 // SIMD-ONLY02-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 4 -// SIMD-ONLY02-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 1 // SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR7]], ptr [[TMP13]], align 4 // SIMD-ONLY02-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY02-NEXT: store ptr [[TMP15]], ptr [[_TMP8]], align 4 @@ -3859,11 +3859,11 @@ void bar() { // SIMD-ONLY02-NEXT: [[TMP18:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 // SIMD-ONLY02-NEXT: [[TMP19:%.*]] = load ptr, ptr [[_TMP8]], align 4 // SIMD-ONLY02-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 4 -// SIMD-ONLY02-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP20]], i32 1 // SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR10]], ptr [[TMP19]], align 4 // SIMD-ONLY02-NEXT: [[TMP21:%.*]] = load ptr, ptr [[_TMP9]], align 4 // SIMD-ONLY02-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP21]], 
align 4 -// SIMD-ONLY02-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 1 +// SIMD-ONLY02-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP22]], i32 1 // SIMD-ONLY02-NEXT: store ptr [[INCDEC_PTR11]], ptr [[TMP21]], align 4 // SIMD-ONLY02-NEXT: ret void // @@ -3896,34 +3896,34 @@ void bar() { // SIMD-ONLY03-NEXT: store ptr [[LR]], ptr [[LR_ADDR]], align 4 // SIMD-ONLY03-NEXT: store ptr [[TR]], ptr [[TR_ADDR]], align 4 // SIMD-ONLY03-NEXT: [[TMP0:%.*]] = load ptr, ptr @g, align 4 -// SIMD-ONLY03-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR]], ptr @g, align 4 // SIMD-ONLY03-NEXT: [[TMP1:%.*]] = load ptr, ptr [[L]], align 4 -// SIMD-ONLY03-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 1 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 1 // SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR1]], ptr [[L]], align 4 // SIMD-ONLY03-NEXT: [[TMP2:%.*]] = load ptr, ptr [[T]], align 4 -// SIMD-ONLY03-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i32 1 // SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR2]], ptr [[T]], align 4 // SIMD-ONLY03-NEXT: [[TMP3:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 // SIMD-ONLY03-NEXT: store ptr [[TMP3]], ptr [[TMP]], align 4 // SIMD-ONLY03-NEXT: [[TMP4:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 // SIMD-ONLY03-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4 // SIMD-ONLY03-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 4 -// SIMD-ONLY03-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 1 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 1 // SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR3]], ptr [[TMP5]], align 4 // SIMD-ONLY03-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY03-NEXT: store ptr [[TMP7]], ptr [[_TMP4]], align 4 // SIMD-ONLY03-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY03-NEXT: [[TMP9:%.*]] = load ptr, ptr [[_TMP4]], align 4 // SIMD-ONLY03-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 4 -// SIMD-ONLY03-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP10]], i32 1 // SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR5]], ptr [[TMP9]], align 4 // SIMD-ONLY03-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY03-NEXT: store ptr [[TMP11]], ptr [[_TMP6]], align 4 // SIMD-ONLY03-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY03-NEXT: [[TMP13:%.*]] = load ptr, ptr [[_TMP6]], align 4 // SIMD-ONLY03-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 4 -// SIMD-ONLY03-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 1 // SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR7]], ptr [[TMP13]], align 4 // SIMD-ONLY03-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TR_ADDR]], align 4 // SIMD-ONLY03-NEXT: store ptr [[TMP15]], ptr [[_TMP8]], align 4 @@ -3933,11 +3933,11 @@ void bar() { // SIMD-ONLY03-NEXT: [[TMP18:%.*]] = load ptr, ptr [[LR_ADDR]], align 4 // SIMD-ONLY03-NEXT: [[TMP19:%.*]] = load ptr, ptr 
[[_TMP8]], align 4 // SIMD-ONLY03-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 4 -// SIMD-ONLY03-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR10:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP20]], i32 1 // SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR10]], ptr [[TMP19]], align 4 // SIMD-ONLY03-NEXT: [[TMP21:%.*]] = load ptr, ptr [[_TMP9]], align 4 // SIMD-ONLY03-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP21]], align 4 -// SIMD-ONLY03-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 1 +// SIMD-ONLY03-NEXT: [[INCDEC_PTR11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP22]], i32 1 // SIMD-ONLY03-NEXT: store ptr [[INCDEC_PTR11]], ptr [[TMP21]], align 4 // SIMD-ONLY03-NEXT: ret void // @@ -3951,7 +3951,7 @@ void bar() { // CK20-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) // CK20-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) // CK20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 8 -// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // CK20-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 8 // CK20-NEXT: ret void // @@ -4185,7 +4185,7 @@ void bar() { // CK20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CK20-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 // CK20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 8 -// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i32 1 // CK20-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 // CK20-NEXT: ret void // @@ -4199,7 +4199,7 @@ void bar() { // CK20-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 1 // CK20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 8 // CK20-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 1 // CK20-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 8 // CK20-NEXT: ret void // @@ -4212,12 +4212,12 @@ void bar() { // CK20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CK20-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 // CK20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 8 -// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK20-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i32 1 // CK20-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 // CK20-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[TMP0]], i32 0, i32 1 // CK20-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B]], align 8 // CK20-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 -// CK20-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// CK20-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 1 // CK20-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP2]], align 8 // CK20-NEXT: ret void // @@ -4231,7 +4231,7 @@ void bar() { // CK21-NEXT: call 
void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) // CK21-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) // CK21-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 8 -// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // CK21-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 8 // CK21-NEXT: ret void // @@ -4465,7 +4465,7 @@ void bar() { // CK21-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CK21-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 // CK21-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 8 -// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i32 1 // CK21-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 // CK21-NEXT: ret void // @@ -4479,7 +4479,7 @@ void bar() { // CK21-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 1 // CK21-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 8 // CK21-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 -// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 1 // CK21-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 8 // CK21-NEXT: ret void // @@ -4492,12 +4492,12 @@ void bar() { // CK21-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CK21-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 // CK21-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 8 -// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK21-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i32 1 // CK21-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 // CK21-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[TMP0]], i32 0, i32 1 // CK21-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B]], align 8 // CK21-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 -// CK21-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// CK21-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 1 // CK21-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP2]], align 8 // CK21-NEXT: ret void // @@ -4511,7 +4511,7 @@ void bar() { // CK22-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) // CK22-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) // CK22-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 4 -// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // CK22-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 4 // CK22-NEXT: ret void // @@ -4745,7 +4745,7 @@ void bar() { // CK22-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CK22-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 // CK22-NEXT: 
[[TMP1:%.*]] = load ptr, ptr [[A]], align 4 -// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i32 1 // CK22-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 // CK22-NEXT: ret void // @@ -4759,7 +4759,7 @@ void bar() { // CK22-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 1 // CK22-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 4 // CK22-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 4 -// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 1 // CK22-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 4 // CK22-NEXT: ret void // @@ -4772,12 +4772,12 @@ void bar() { // CK22-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CK22-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 // CK22-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 4 -// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK22-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i32 1 // CK22-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 // CK22-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[TMP0]], i32 0, i32 1 // CK22-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B]], align 4 // CK22-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 4 -// CK22-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// CK22-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 1 // CK22-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP2]], align 4 // CK22-NEXT: ret void // @@ -4791,7 +4791,7 @@ void bar() { // CK23-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) // CK23-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) // CK23-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 4 -// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // CK23-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 4 // CK23-NEXT: ret void // @@ -5025,7 +5025,7 @@ void bar() { // CK23-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CK23-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 // CK23-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 4 -// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i32 1 // CK23-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 // CK23-NEXT: ret void // @@ -5039,7 +5039,7 @@ void bar() { // CK23-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 1 // CK23-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 4 // CK23-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 4 -// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 1 // CK23-NEXT: store ptr [[INCDEC_PTR]], ptr [[TMP1]], align 4 // CK23-NEXT: ret void // @@ -5052,12 +5052,12 @@ 
void bar() { // CK23-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CK23-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[TMP0]], i32 0, i32 0 // CK23-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 4 -// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 1 +// CK23-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP1]], i32 1 // CK23-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 // CK23-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[TMP0]], i32 0, i32 1 // CK23-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B]], align 4 // CK23-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 4 -// CK23-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// CK23-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 1 // CK23-NEXT: store ptr [[INCDEC_PTR1]], ptr [[TMP2]], align 4 // CK23-NEXT: ret void // @@ -5071,7 +5071,7 @@ void bar() { // SIMD-ONLY10-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) // SIMD-ONLY10-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) // SIMD-ONLY10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 8 -// SIMD-ONLY10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY10-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 8 // SIMD-ONLY10-NEXT: ret void // @@ -5101,21 +5101,21 @@ void bar() { // SIMD-ONLY10-NEXT: store ptr null, ptr [[LA]], align 8 // SIMD-ONLY10-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 // SIMD-ONLY10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A]], align 8 -// SIMD-ONLY10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY10-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY10-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 // SIMD-ONLY10-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 // SIMD-ONLY10-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 8 // SIMD-ONLY10-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 -// SIMD-ONLY10-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// SIMD-ONLY10-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 1 // SIMD-ONLY10-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP1]], align 8 // SIMD-ONLY10-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 // SIMD-ONLY10-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A3]], align 8 -// SIMD-ONLY10-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// SIMD-ONLY10-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 1 // SIMD-ONLY10-NEXT: store ptr [[INCDEC_PTR4]], ptr [[A3]], align 8 // SIMD-ONLY10-NEXT: [[B5:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 // SIMD-ONLY10-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B5]], align 8 // SIMD-ONLY10-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 -// SIMD-ONLY10-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i32 1 +// SIMD-ONLY10-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds nuw 
double, ptr [[TMP5]], i32 1 // SIMD-ONLY10-NEXT: store ptr [[INCDEC_PTR6]], ptr [[TMP4]], align 8 // SIMD-ONLY10-NEXT: ret void // @@ -5145,7 +5145,7 @@ void bar() { // SIMD-ONLY11-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) // SIMD-ONLY11-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 8 dereferenceable(16) [[A]], ptr noundef nonnull align 8 dereferenceable(8) [[ARG_ADDR]]) // SIMD-ONLY11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 8 -// SIMD-ONLY11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY11-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 8 // SIMD-ONLY11-NEXT: ret void // @@ -5175,21 +5175,21 @@ void bar() { // SIMD-ONLY11-NEXT: store ptr null, ptr [[LA]], align 8 // SIMD-ONLY11-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 // SIMD-ONLY11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A]], align 8 -// SIMD-ONLY11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY11-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY11-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 8 // SIMD-ONLY11-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 // SIMD-ONLY11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 8 // SIMD-ONLY11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8 -// SIMD-ONLY11-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// SIMD-ONLY11-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 1 // SIMD-ONLY11-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP1]], align 8 // SIMD-ONLY11-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 // SIMD-ONLY11-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A3]], align 8 -// SIMD-ONLY11-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// SIMD-ONLY11-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 1 // SIMD-ONLY11-NEXT: store ptr [[INCDEC_PTR4]], ptr [[A3]], align 8 // SIMD-ONLY11-NEXT: [[B5:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 // SIMD-ONLY11-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B5]], align 8 // SIMD-ONLY11-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 -// SIMD-ONLY11-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i32 1 +// SIMD-ONLY11-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds nuw double, ptr [[TMP5]], i32 1 // SIMD-ONLY11-NEXT: store ptr [[INCDEC_PTR6]], ptr [[TMP4]], align 8 // SIMD-ONLY11-NEXT: ret void // @@ -5219,7 +5219,7 @@ void bar() { // SIMD-ONLY12-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) // SIMD-ONLY12-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) // SIMD-ONLY12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 4 -// SIMD-ONLY12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY12-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 
4 // SIMD-ONLY12-NEXT: ret void // @@ -5249,21 +5249,21 @@ void bar() { // SIMD-ONLY12-NEXT: store ptr null, ptr [[LA]], align 4 // SIMD-ONLY12-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 // SIMD-ONLY12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A]], align 4 -// SIMD-ONLY12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY12-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY12-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 // SIMD-ONLY12-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 // SIMD-ONLY12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 4 // SIMD-ONLY12-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 4 -// SIMD-ONLY12-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// SIMD-ONLY12-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 1 // SIMD-ONLY12-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP1]], align 4 // SIMD-ONLY12-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 // SIMD-ONLY12-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A3]], align 4 -// SIMD-ONLY12-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// SIMD-ONLY12-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 1 // SIMD-ONLY12-NEXT: store ptr [[INCDEC_PTR4]], ptr [[A3]], align 4 // SIMD-ONLY12-NEXT: [[B5:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 // SIMD-ONLY12-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B5]], align 4 // SIMD-ONLY12-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 4 -// SIMD-ONLY12-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i32 1 +// SIMD-ONLY12-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds nuw double, ptr [[TMP5]], i32 1 // SIMD-ONLY12-NEXT: store ptr [[INCDEC_PTR6]], ptr [[TMP4]], align 4 // SIMD-ONLY12-NEXT: ret void // @@ -5293,7 +5293,7 @@ void bar() { // SIMD-ONLY13-NEXT: call void @_ZN2STIdEC1ERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) // SIMD-ONLY13-NEXT: call void @_ZN2STIdE3fooERPd(ptr noundef nonnull align 4 dereferenceable(8) [[A]], ptr noundef nonnull align 4 dereferenceable(4) [[ARG_ADDR]]) // SIMD-ONLY13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARG_ADDR]], align 4 -// SIMD-ONLY13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY13-NEXT: store ptr [[INCDEC_PTR]], ptr [[ARG_ADDR]], align 4 // SIMD-ONLY13-NEXT: ret void // @@ -5323,21 +5323,21 @@ void bar() { // SIMD-ONLY13-NEXT: store ptr null, ptr [[LA]], align 4 // SIMD-ONLY13-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[THIS1]], i32 0, i32 0 // SIMD-ONLY13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A]], align 4 -// SIMD-ONLY13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 1 +// SIMD-ONLY13-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP0]], i32 1 // SIMD-ONLY13-NEXT: store ptr [[INCDEC_PTR]], ptr [[A]], align 4 // SIMD-ONLY13-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 // SIMD-ONLY13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B]], align 4 // SIMD-ONLY13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 4 -// SIMD-ONLY13-NEXT: 
[[INCDEC_PTR2:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 1 +// SIMD-ONLY13-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 1 // SIMD-ONLY13-NEXT: store ptr [[INCDEC_PTR2]], ptr [[TMP1]], align 4 // SIMD-ONLY13-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 0 // SIMD-ONLY13-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A3]], align 4 -// SIMD-ONLY13-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 1 +// SIMD-ONLY13-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds nuw double, ptr [[TMP3]], i32 1 // SIMD-ONLY13-NEXT: store ptr [[INCDEC_PTR4]], ptr [[A3]], align 4 // SIMD-ONLY13-NEXT: [[B5:%.*]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[THIS1]], i32 0, i32 1 // SIMD-ONLY13-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B5]], align 4 // SIMD-ONLY13-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 4 -// SIMD-ONLY13-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i32 1 +// SIMD-ONLY13-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds nuw double, ptr [[TMP5]], i32 1 // SIMD-ONLY13-NEXT: store ptr [[INCDEC_PTR6]], ptr [[TMP4]], align 4 // SIMD-ONLY13-NEXT: ret void // diff --git a/clang/test/OpenMP/target_map_both_pointer_pointee_codegen.cpp b/clang/test/OpenMP/target_map_both_pointer_pointee_codegen.cpp index fcaceac7d3467..87fa7fe462daa 100644 --- a/clang/test/OpenMP/target_map_both_pointer_pointee_codegen.cpp +++ b/clang/test/OpenMP/target_map_both_pointer_pointee_codegen.cpp @@ -45,7 +45,7 @@ void foo() { // CHECK-NEXT: store ptr [[CALL]], ptr [[PTR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR]], align 8 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 0 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i64 0 // CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK-NEXT: store ptr [[PTR]], ptr [[TMP2]], align 8 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 diff --git a/clang/test/OpenMP/target_map_codegen_01.cpp b/clang/test/OpenMP/target_map_codegen_01.cpp index d112500eb5fdd..9f3553d2377cb 100644 --- a/clang/test/OpenMP/target_map_codegen_01.cpp +++ b/clang/test/OpenMP/target_map_codegen_01.cpp @@ -108,6 +108,6 @@ void implicit_maps_reference (int a, int *b){ // CK2: store ptr [[ADDR]], ptr [[REF]], // CK2: [[T:%.+]] = load ptr, ptr [[REF]], // CK2: [[TT:%.+]] = load ptr, ptr [[T]], -// CK2: getelementptr inbounds i32, ptr [[TT]], i32 1 +// CK2: getelementptr inbounds nuw i32, ptr [[TT]], i32 1 #endif // CK2 #endif diff --git a/clang/test/OpenMP/target_map_codegen_21.cpp b/clang/test/OpenMP/target_map_codegen_21.cpp index a1419b7d4beb8..f5c517692d8c8 100644 --- a/clang/test/OpenMP/target_map_codegen_21.cpp +++ b/clang/test/OpenMP/target_map_codegen_21.cpp @@ -185,7 +185,7 @@ int explicit_maps_globals(void){ // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 // CK22-DAG: store ptr @c, ptr [[BP0]] -// CK22-DAG: store ptr getelementptr inbounds ([100 x i32], ptr @c, i{{.+}} 0, i{{.+}} 1), ptr [[P0]] +// CK22-DAG: store ptr getelementptr inbounds nuw ([100 x i32], ptr @c, i{{.+}} 0, i{{.+}} 1), ptr [[P0]] // CK22: call void [[CALL03:@.+]](ptr {{[^,]+}}) #pragma omp target map(c [1:4]) @@ -277,7 +277,7 
+277,7
@@ int explicit_maps_globals(void){ // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 // CK22-DAG: store ptr @sc, ptr [[BP0]] -// CK22-DAG: store ptr getelementptr inbounds ([100 x [[ST]]], ptr @sc, i{{.+}} 0, i{{.+}} 1), ptr [[P0]] +// CK22-DAG: store ptr getelementptr inbounds nuw ([100 x [[ST]]], ptr @sc, i{{.+}} 0, i{{.+}} 1), ptr [[P0]] // CK22: call void [[CALL08:@.+]](ptr {{[^,]+}}) #pragma omp target map(sc [1:4]) @@ -369,7 +369,7 @@ int explicit_maps_globals(void){ // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0 // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0 // CK22-DAG: store ptr @stc, ptr [[BP0]] -// CK22-DAG: store ptr getelementptr inbounds ([100 x [[STT]]], ptr @stc, i{{.+}} 0, i{{.+}} 1), ptr [[P0]] +// CK22-DAG: store ptr getelementptr inbounds nuw ([100 x [[STT]]], ptr @stc, i{{.+}} 0, i{{.+}} 1), ptr [[P0]] // CK22: call void [[CALL13:@.+]](ptr {{[^,]+}}) #pragma omp target map(stc [1:4]) diff --git a/clang/test/OpenMP/target_map_codegen_27.cpp b/clang/test/OpenMP/target_map_codegen_27.cpp index fe7ae12e00d13..bfe75bca481be 100644 --- a/clang/test/OpenMP/target_map_codegen_27.cpp +++ b/clang/test/OpenMP/target_map_codegen_27.cpp @@ -82,7 +82,7 @@ void explicit_maps_pointer_references (int *p){ // CK28-DAG: store ptr [[VAR1:%.+]], ptr [[P0]] // CK28-DAG: [[VAR0]] = load ptr, ptr [[VAR00:%.+]], // CK28-DAG: [[VAR00]] = load ptr, ptr [[VAR000:%.+]], -// CK28-DAG: [[VAR1]] = getelementptr inbounds i32, ptr [[VAR11:%.+]], i{{64|32}} 2 +// CK28-DAG: [[VAR1]] = getelementptr inbounds nuw i32, ptr [[VAR11:%.+]], i{{64|32}} 2 // CK28-DAG: [[VAR11]] = load ptr, ptr [[VAR111:%.+]], // CK28-DAG: [[VAR111]] = load ptr, ptr [[VAR1111:%.+]], diff --git a/clang/test/OpenMP/target_map_codegen_28.cpp b/clang/test/OpenMP/target_map_codegen_28.cpp index e92f7e4773ecf..67ea72d791d03 100644 --- a/clang/test/OpenMP/target_map_codegen_28.cpp +++ b/clang/test/OpenMP/target_map_codegen_28.cpp @@ -89,7 +89,7 @@ struct SSB{ // CK29-DAG: store ptr [[VAR1:%.+]], ptr [[BP2]] // CK29-DAG: store ptr [[VAR2:%.+]], ptr [[P2]] // CK29-DAG: [[VAR1]] = getelementptr inbounds nuw [[SSA]], ptr %{{.+}}, i32 0, i32 1 -// CK29-DAG: [[VAR2]] = getelementptr inbounds double, ptr [[VAR22:%.+]], i{{.+}} 0 +// CK29-DAG: [[VAR2]] = getelementptr inbounds nuw double, ptr [[VAR22:%.+]], i{{.+}} 0 // CK29-DAG: [[VAR22]] = load ptr, ptr %{{.+}}, // CK29: call void [[CALL00:@.+]](ptr {{[^,]+}}) @@ -129,7 +129,7 @@ struct SSB{ // CK29-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2 // CK29-DAG: store ptr [[VAR1]], ptr [[BP2]] // CK29-DAG: store ptr [[VAR2:%.+]], ptr [[P2]] -// CK29-DAG: [[VAR2]] = getelementptr inbounds double, ptr [[VAR22:%.+]], i{{.+}} 0 +// CK29-DAG: [[VAR2]] = getelementptr inbounds nuw double, ptr [[VAR22:%.+]], i{{.+}} 0 // CK29-DAG: [[VAR22]] = load ptr, ptr %{{.+}}, // CK29: call void [[CALL00:@.+]](ptr {{[^,]+}}) @@ -164,7 +164,7 @@ struct SSB{ // CK29-DAG: store ptr [[VAR1:%.+]], ptr [[BP2]] // CK29-DAG: store ptr [[VAR2:%.+]], ptr [[P2]] // CK29-DAG: [[VAR1]] = getelementptr inbounds nuw [[SSA]], ptr %{{.+}}, i32 0, i32 1 -// CK29-DAG: [[VAR2]] = getelementptr inbounds double, ptr [[VAR22:%.+]], i{{.+}} 0 +// CK29-DAG: [[VAR2]] = getelementptr inbounds nuw double, ptr [[VAR22:%.+]], i{{.+}} 0 // CK29-DAG: [[VAR22]] = load ptr, ptr %{{.+}}, // CK29: call void [[CALL00:@.+]](ptr 
{{[^,]+}}) diff --git a/clang/test/OpenMP/target_map_codegen_29.cpp b/clang/test/OpenMP/target_map_codegen_29.cpp index 936a01573c2d2..3ca7b228d26c2 100644 --- a/clang/test/OpenMP/target_map_codegen_29.cpp +++ b/clang/test/OpenMP/target_map_codegen_29.cpp @@ -89,7 +89,7 @@ typedef struct StructWithPtrTag : public Base { // CK30-DAG: [[PTR:%.+]] = getelementptr inbounds [4 x ptr], ptr [[PTRS]], i32 0, i32 2 // CK30-DAG: store ptr [[S_PTR1_BEGIN:%.+]], ptr [[PTR]], // CK30-DAG: [[S_PTR1]] = getelementptr inbounds nuw [[STRUCT]], ptr [[S]], i32 0, i32 4 -// CK30-DAG: [[S_PTR1_BEGIN]] = getelementptr inbounds i32, ptr [[S_PTR1_BEGIN_REF:%.+]], i{{64|32}} 0 +// CK30-DAG: [[S_PTR1_BEGIN]] = getelementptr inbounds nuw i32, ptr [[S_PTR1_BEGIN_REF:%.+]], i{{64|32}} 0 // CK30-DAG: [[S_PTR1_BEGIN_REF]] = load ptr, ptr [[S_PTR1:%.+]], // CK30-DAG: [[S_PTR1]] = getelementptr inbounds nuw [[STRUCT]], ptr [[S]], i32 0, i32 4 @@ -98,7 +98,7 @@ typedef struct StructWithPtrTag : public Base { // CK30-DAG: [[PTR:%.+]] = getelementptr inbounds [4 x ptr], ptr [[PTRS]], i32 0, i32 3 // CK30-DAG: store ptr [[S_PTRBASE1_BEGIN:%.+]], ptr [[PTR]], // CK30-DAG: [[S_PTRBASE1]] = getelementptr inbounds nuw [[BASE]], ptr [[S_BASE:%.+]], i32 0, i32 2 -// CK30-DAG: [[S_PTRBASE1_BEGIN]] = getelementptr inbounds i32, ptr [[S_PTRBASE1_BEGIN_REF:%.+]], i{{64|32}} 0 +// CK30-DAG: [[S_PTRBASE1_BEGIN]] = getelementptr inbounds nuw i32, ptr [[S_PTRBASE1_BEGIN_REF:%.+]], i{{64|32}} 0 // CK30-DAG: [[S_PTRBASE1_BEGIN_REF]] = load ptr, ptr [[S_PTRBASE1:%.+]], // CK30-DAG: [[S_PTRBASE1]] = getelementptr inbounds nuw [[BASE]], ptr [[S_BASE:%.+]], i32 0, i32 2 void map_with_deep_copy() { diff --git a/clang/test/OpenMP/target_map_deref_array_codegen.cpp b/clang/test/OpenMP/target_map_deref_array_codegen.cpp index 9d395b0ab8cd8..e61fc7296332b 100644 --- a/clang/test/OpenMP/target_map_deref_array_codegen.cpp +++ b/clang/test/OpenMP/target_map_deref_array_codegen.cpp @@ -75,7 +75,7 @@ void foo(int **t1d) // CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[T1D_ADDR]], align 8 // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[T1D_ADDR]], align 8 // CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8 -// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 0 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP10]], i64 0 // CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK-NEXT: store ptr [[TMP7]], ptr [[TMP11]], align 8 // CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 diff --git a/clang/test/OpenMP/target_map_member_expr_array_section_codegen.cpp b/clang/test/OpenMP/target_map_member_expr_array_section_codegen.cpp index 7a0da002fb944..692e3a4214c9d 100644 --- a/clang/test/OpenMP/target_map_member_expr_array_section_codegen.cpp +++ b/clang/test/OpenMP/target_map_member_expr_array_section_codegen.cpp @@ -28,12 +28,12 @@ struct maptest { // CHECK: getelementptr inbounds // CHECK: [[S_ADDR:%.+]] = getelementptr inbounds nuw %struct.maptest, ptr [[THIS:%.+]], i32 0, i32 0 // CHECK: [[S_DATA_ADDR:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[S_ADDR]], i32 0, i32 0 - // CHECK: [[S_DATA_0_ADDR:%.+]] = getelementptr inbounds [6 x float], ptr [[S_DATA_ADDR]], i64 0, i64 0 + // CHECK: [[S_DATA_0_ADDR:%.+]] = getelementptr inbounds nuw [6 x float], ptr [[S_DATA_ADDR]], i64 0, i64 0 // SZ = &this->s.data[6]-&this->s.data[0] // CHECK: [[S_ADDR:%.+]] = getelementptr inbounds nuw 
%struct.maptest, ptr [[THIS]], i32 0, i32 0 // CHECK: [[S_DATA_ADDR:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[S_ADDR]], i32 0, i32 0 - // CHECK: [[S_DATA_5_ADDR:%.+]] = getelementptr inbounds [6 x float], ptr [[S_DATA_ADDR]], i64 0, i64 5 + // CHECK: [[S_DATA_5_ADDR:%.+]] = getelementptr inbounds nuw [6 x float], ptr [[S_DATA_ADDR]], i64 0, i64 5 // CHECK: [[S_DATA_6_ADDR:%.+]] = getelementptr float, ptr [[S_DATA_5_ADDR]], i32 1 // CHECK: [[END_BC:%.+]] = ptrtoint ptr [[S_DATA_6_ADDR]] to i64 // CHECK: [[BEG_BC:%.+]] = ptrtoint ptr [[S_DATA_0_ADDR]] to i64 @@ -64,12 +64,12 @@ struct maptest { // CHECK: [[SIZE:%.+]] = alloca [2 x i64], // CHECK: [[S_ADDR:%.+]] = getelementptr inbounds nuw %struct.maptest, ptr [[THIS:%.+]], i32 0, i32 0 // CHECK: [[S_DATA_ADDR:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[S_ADDR]], i32 0, i32 0 - // CHECK: [[S_DATA_0_ADDR:%.+]] = getelementptr inbounds [6 x float], ptr [[S_DATA_ADDR]], i64 0, i64 0 + // CHECK: [[S_DATA_0_ADDR:%.+]] = getelementptr inbounds nuw [6 x float], ptr [[S_DATA_ADDR]], i64 0, i64 0 // SZ = &this->s.data[6]-&this->s.data[0] // CHECK: [[S_ADDR:%.+]] = getelementptr inbounds nuw %struct.maptest, ptr [[THIS]], i32 0, i32 0 // CHECK: [[S_DATA_ADDR:%.+]] = getelementptr inbounds nuw %struct.S, ptr [[S_ADDR]], i32 0, i32 0 - // CHECK: [[S_DATA_5_ADDR:%.+]] = getelementptr inbounds [6 x float], ptr [[S_DATA_ADDR]], i64 0, i64 5 + // CHECK: [[S_DATA_5_ADDR:%.+]] = getelementptr inbounds nuw [6 x float], ptr [[S_DATA_ADDR]], i64 0, i64 5 // CHECK: [[S_DATA_6_ADDR:%.+]] = getelementptr float, ptr [[S_DATA_5_ADDR]], i32 1 // CHECK: [[END_BC:%.+]] = ptrtoint ptr [[S_DATA_6_ADDR]] to i64 // CHECK: [[BEG_BC:%.+]] = ptrtoint ptr [[S_DATA_0_ADDR]] to i64 diff --git a/clang/test/OpenMP/target_map_member_expr_codegen.cpp b/clang/test/OpenMP/target_map_member_expr_codegen.cpp index 9b64647928a24..fb36ba7b78d5b 100644 --- a/clang/test/OpenMP/target_map_member_expr_codegen.cpp +++ b/clang/test/OpenMP/target_map_member_expr_codegen.cpp @@ -223,7 +223,7 @@ void foo() { // CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK-NEXT: [[A4:%.*]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[TMP10]], i32 0, i32 0 // CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[A4]], align 8 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i64 0 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 0 // CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ASIZE]], align 4 // CHECK-NEXT: [[CONV:%.*]] = zext i32 [[TMP12]] to i64 // CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[CONV]], 4 @@ -233,7 +233,7 @@ void foo() { // CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[D_ADDR]], align 8 // CHECK-NEXT: [[C5:%.*]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[TMP16]], i32 0, i32 1 // CHECK-NEXT: [[TMP17:%.*]] = load ptr, ptr [[C5]], align 8 -// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 0 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 0 // CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[CSIZE]], align 4 // CHECK-NEXT: [[CONV7:%.*]] = zext i32 [[TMP18]] to i64 // CHECK-NEXT: [[TMP19:%.*]] = mul nuw i64 [[CONV7]], 4 @@ -343,7 +343,7 @@ void foo() { // CHECK-NEXT: [[TMP79:%.*]] = load ptr, ptr [[_TMP12]], align 8 // CHECK-NEXT: [[C15:%.*]] = getelementptr inbounds nuw [[STRUCT_DESCRIPTOR]], ptr [[TMP79]], i32 0, i32 1 // CHECK-NEXT: [[TMP80:%.*]] = load ptr, ptr [[C15]], align 8 -// CHECK-NEXT: 
[[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i64 0 +// CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw float, ptr [[TMP80]], i64 0 // CHECK-NEXT: [[TMP81:%.*]] = load i32, ptr [[CSIZE]], align 4 // CHECK-NEXT: [[CONV17:%.*]] = zext i32 [[TMP81]] to i64 // CHECK-NEXT: [[TMP82:%.*]] = mul nuw i64 [[CONV17]], 4 diff --git a/clang/test/OpenMP/target_map_nest_defalut_mapper_codegen.cpp b/clang/test/OpenMP/target_map_nest_defalut_mapper_codegen.cpp index ffb145d8e50fc..775f0b296b1b6 100644 --- a/clang/test/OpenMP/target_map_nest_defalut_mapper_codegen.cpp +++ b/clang/test/OpenMP/target_map_nest_defalut_mapper_codegen.cpp @@ -45,7 +45,7 @@ void foo() { // CHECK-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[ARRAYIDX1]], i32 0, i32 1 // CHECK-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_C:%.*]], ptr [[F]], i32 0, i32 0 // CHECK-NEXT: store i32 222, ptr [[A]], align 4 -// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [10 x %struct.D], ptr [[SA]], i64 0, i64 0 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [10 x %struct.D], ptr [[SA]], i64 0, i64 0 // CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK-NEXT: store ptr [[SA]], ptr [[TMP0]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 diff --git a/clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp index 3d0710acf0ee7..5cce677e88572 100644 --- a/clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp @@ -96,16 +96,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP4]] // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i64 9 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] @@ -135,7 +135,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP20]] // CHECK1-NEXT: store ptr [[_TMP6]], ptr [[_TMP5]], align 8 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[_TMP6]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = 
getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -150,19 +150,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP27]], align 8 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 0 // CHECK1-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds ptr, ptr [[TMP30]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP30]], i64 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[ARRAYIDX8]], align 8 -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN10:%.*]] = add nsw i64 -1, [[TMP33]] // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr [[TMP34]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP34]], i64 9 // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[ARRAYIDX11]], align 8 -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[LB_ADD_LEN10]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP35]], i64 [[LB_ADD_LEN10]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX9]], ptr [[TMP36]], align 8 @@ -483,9 +483,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr 
[[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp b/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp index 28d63dbf8c4a9..c0bb4a6d6cc82 100644 --- a/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp @@ -85,16 +85,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP4]] // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i64 9 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] @@ -124,7 +124,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP20]] // CHECK1-NEXT: store ptr [[_TMP5]], ptr [[TMP]], align 8 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[_TMP5]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -139,19 +139,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP27]], align 8 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr 
[[DOTRD_INPUT_GEP_6]], i32 0, i32 0 // CHECK1-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds ptr, ptr [[TMP30]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP30]], i64 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[ARRAYIDX7]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN9:%.*]] = add nsw i64 -1, [[TMP33]] // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr [[TMP34]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP34]], i64 9 // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[ARRAYIDX10]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[TMP36]], align 8 @@ -429,9 +429,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/target_task_affinity_codegen.cpp b/clang/test/OpenMP/target_task_affinity_codegen.cpp index 85c5d63a6cd9c..53960cee4b730 100644 --- a/clang/test/OpenMP/target_task_affinity_codegen.cpp +++ b/clang/test/OpenMP/target_task_affinity_codegen.cpp @@ -76,7 +76,7 @@ int main() { // CHECK1-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 8 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP3]], align 8 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 @@ -102,7 +102,7 @@ int main() { // CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[B]], align 8 // CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[B]], align 
8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP18]], i64 0 // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[TMP17]], ptr [[TMP19]], align 8 // CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0 @@ -174,9 +174,9 @@ int main() { // CHECK1-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i64 48, i64 8, ptr @.omp_task_entry.) // CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x %struct.kmp_task_affinity_info_t], ptr [[DOTAFFS_ARR_ADDR]], i64 0, i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP4]], i64 0 // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[A_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 1023 +// CHECK1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i64 1023 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[ARRAYIDX1]], i32 1 // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP6]] to i64 @@ -299,7 +299,7 @@ int main() { // CHECK3-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A]], align 4 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i32 0 // CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK3-NEXT: store ptr [[TMP1]], ptr [[TMP3]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 @@ -325,7 +325,7 @@ int main() { // CHECK3-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP0]], align 4 // CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[B]], align 4 // CHECK3-NEXT: [[TMP18:%.*]] = load ptr, ptr [[B]], align 4 -// CHECK3-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0 +// CHECK3-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP18]], i32 0 // CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0 // CHECK3-NEXT: store ptr [[TMP17]], ptr [[TMP19]], align 4 // CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0 @@ -397,9 +397,9 @@ int main() { // CHECK3-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 24, i32 4, ptr @.omp_task_entry.) 
// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x %struct.kmp_task_affinity_info_t], ptr [[DOTAFFS_ARR_ADDR]], i32 0, i32 0 // CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 4 -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP4]], i32 0 // CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[A_ADDR]], align 4 -// CHECK3-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1023 +// CHECK3-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i32 1023 // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[ARRAYIDX1]], i32 1 // CHECK3-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i32 // CHECK3-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP6]] to i32 @@ -587,9 +587,9 @@ int main() { // CHECK9-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i64 48, i64 8, ptr @.omp_task_entry.) // CHECK9-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x %struct.kmp_task_affinity_info_t], ptr [[DOTAFFS_ARR_ADDR]], i64 0, i64 0 // CHECK9-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8 -// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 0 +// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP4]], i64 0 // CHECK9-NEXT: [[TMP5:%.*]] = load ptr, ptr [[A_ADDR]], align 8 -// CHECK9-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 1023 +// CHECK9-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i64 1023 // CHECK9-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[ARRAYIDX1]], i32 1 // CHECK9-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK9-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP6]] to i64 @@ -709,9 +709,9 @@ int main() { // CHECK11-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 24, i32 4, ptr @.omp_task_entry.) 
// CHECK11-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x %struct.kmp_task_affinity_info_t], ptr [[DOTAFFS_ARR_ADDR]], i32 0, i32 0 // CHECK11-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 4 -// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP4]], i32 0 // CHECK11-NEXT: [[TMP5:%.*]] = load ptr, ptr [[A_ADDR]], align 4 -// CHECK11-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1023 +// CHECK11-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i32 1023 // CHECK11-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[ARRAYIDX1]], i32 1 // CHECK11-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i32 // CHECK11-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP6]] to i32 diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp index 6f671dbb27abb..2c36b410af064 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp @@ -91,16 +91,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP4]] // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i64 9 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] @@ -130,7 +130,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP20]] // CHECK1-NEXT: store ptr [[_TMP5]], ptr [[TMP]], align 8 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[_TMP5]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw 
[[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -145,19 +145,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP27]], align 8 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 0 // CHECK1-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds ptr, ptr [[TMP30]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP30]], i64 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[ARRAYIDX7]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN9:%.*]] = add nsw i64 -1, [[TMP33]] // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr [[TMP34]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP34]], i64 9 // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[ARRAYIDX10]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[TMP36]], align 8 @@ -435,16 +435,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP3]], i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 0 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP6]] // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP7]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP7]], i64 9 // CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = 
getelementptr inbounds nuw i8, ptr [[TMP8]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP11:%.*]] = sub i64 [[TMP9]], [[TMP10]] @@ -474,7 +474,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP22]] // CHECK1-NEXT: store ptr [[_TMP6]], ptr [[_TMP5]], align 8 // CHECK1-NEXT: store ptr [[TMP23]], ptr [[_TMP6]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP24]], align 8 // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -489,19 +489,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..4, ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP30]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 0 // CHECK1-NEXT: [[TMP32:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds ptr, ptr [[TMP32]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP32]], i64 0 // CHECK1-NEXT: [[TMP33:%.*]] = load ptr, ptr [[ARRAYIDX8]], align 8 -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[TMP33]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP33]], i64 0 // CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP35:%.*]] = sext i32 [[TMP34]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN10:%.*]] = add nsw i64 -1, [[TMP35]] // CHECK1-NEXT: [[TMP36:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr [[TMP36]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP36]], i64 9 // CHECK1-NEXT: [[TMP37:%.*]] = load ptr, ptr [[ARRAYIDX11]], align 8 -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, ptr [[TMP37]], i64 [[LB_ADD_LEN10]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP37]], i64 [[LB_ADD_LEN10]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP31]], align 8 // CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX9]], ptr [[TMP38]], align 8 @@ -822,9 +822,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // 
CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/OpenMP/target_update_codegen.cpp b/clang/test/OpenMP/target_update_codegen.cpp index 5e038989ab6dd..c8211f475c7fc 100644 --- a/clang/test/OpenMP/target_update_codegen.cpp +++ b/clang/test/OpenMP/target_update_codegen.cpp @@ -1118,9 +1118,9 @@ struct ST { void foo(int arg) { ST arr[3][4]; // CK20: [[DIMS:%.+]] = alloca [3 x [[STRUCT_DESCRIPTOR]]], - // CK20: [[ARRAY_IDX:%.+]] = getelementptr inbounds [3 x [4 x [[STRUCT_ST]]]], ptr [[ARR:%.+]], {{.+}} 0, {{.+}} 0 + // CK20: [[ARRAY_IDX:%.+]] = getelementptr inbounds nuw [3 x [4 x [[STRUCT_ST]]]], ptr [[ARR:%.+]], {{.+}} 0, {{.+}} 0 // CK20: [[ARRAY_DECAY:%.+]] = getelementptr inbounds [4 x [[STRUCT_ST]]], ptr [[ARRAY_IDX]], {{.+}} 0, {{.+}} 0 - // CK20: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds [[STRUCT_ST]], ptr [[ARRAY_DECAY]], {{.+}} + // CK20: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds nuw [[STRUCT_ST]], ptr [[ARRAY_DECAY]], {{.+}} // CK20: [[BP0:%.+]] = getelementptr inbounds [1 x ptr], ptr [[BP:%.+]], {{.+}} 0, {{.+}} 0 // CK20: store ptr [[ARR]], ptr [[BP0]], // CK20: [[P0:%.+]] = getelementptr inbounds [1 x ptr], ptr [[P:%.+]], {{.+}} 0, {{.+}} 0 @@ -1186,9 +1186,9 @@ struct ST { // CK21: _ZN2ST3fooEv void foo() { // CK21: [[DIMS:%.+]] = alloca [4 x [[STRUCT_DESCRIPTOR]]], - // CK21: [[ARRAY_IDX:%.+]] = getelementptr inbounds [10 x [10 x [10 x ptr]]], ptr [[DPTR:%.+]], {{.+}} 0, {{.+}} 0 + // CK21: [[ARRAY_IDX:%.+]] = getelementptr inbounds nuw [10 x [10 x [10 x ptr]]], ptr [[DPTR:%.+]], {{.+}} 0, {{.+}} 0 // CK21: [[ARRAY_DECAY:%.+]] = getelementptr inbounds [10 x [10 x ptr]], ptr [[ARRAY_IDX]], {{.+}} 0, {{.+}} 0 - // CK21: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds [10 x ptr], ptr [[ARRAY_DECAY]], {{.+}} 1 + // CK21: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds nuw [10 x ptr], ptr [[ARRAY_DECAY]], {{.+}} 1 // CK21: [[ARRAY_DECAY_2:%.+]] = getelementptr inbounds [10 x ptr], ptr [[ARRAY_IDX_1]], {{.+}} 0, {{.+}} 0 // CK21: [[ARRAY_IDX_3:%.+]] = getelementptr inbounds {{.+}}, ptr [[ARRAY_DECAY_2]], {{.+}} 0 // CK21: [[BP0:%.+]] = getelementptr inbounds [2 x ptr], ptr [[BP:%.+]], {{.+}} 0, {{.+}} 0 @@ -1262,9 +1262,9 @@ struct ST { // CK22: _ZN2ST3fooEPA10_Pi void foo(int *arr[5][10]) { // CK22: [[DIMS:%.+]] = alloca [4 x [[STRUCT_DESCRIPTOR]]], - // CK22: [[ARRAY_IDX:%.+]] = getelementptr inbounds [10 x ptr], ptr [[ARR:%.+]], {{.+}} 0 + // CK22: [[ARRAY_IDX:%.+]] = getelementptr inbounds nuw [10 x ptr], ptr [[ARR:%.+]], {{.+}} 0 // CK22: [[ARRAY_DECAY:%.+]] = getelementptr inbounds [10 x ptr], ptr [[ARRAY_IDX]], {{.+}} 0, {{.+}} 0 - // CK22: [[ARRAY_IDX_2:%.+]] = getelementptr inbounds ptr, ptr [[ARRAY_DECAY:%.+]], {{.+}} 1 + // CK22: [[ARRAY_IDX_2:%.+]] = getelementptr inbounds nuw ptr, ptr [[ARRAY_DECAY:%.+]], {{.+}} 1 // CK22: [[BP0:%.+]] = getelementptr inbounds [1 x ptr], ptr [[BP:%.+]], {{.+}} 0, 
{{.+}} 0 // CK22: [[P0:%.+]] = getelementptr inbounds [1 x ptr], ptr [[P:%.+]], i{{.+}} 0, i{{.+}} 0 // CK22: [[DIM_1:%.+]] = getelementptr inbounds [4 x [[STRUCT_DESCRIPTOR]]], ptr [[DIMS]], {{.+}} 0, {{.+}} 0 @@ -1338,11 +1338,11 @@ void foo(int arg) { float farr[5][5][5]; // CK23: [[ARG_ADDR:%.+]] = alloca i32, // CK23: [[DIMS:%.+]] = alloca [4 x [[STRUCT_DESCRIPTOR]]], - // CK23: [[ARRAY_IDX:%.+]] = getelementptr inbounds [5 x [5 x [5 x float]]], ptr [[ARR:%.+]], {{.+}} 0, {{.+}} 0 + // CK23: [[ARRAY_IDX:%.+]] = getelementptr inbounds nuw [5 x [5 x [5 x float]]], ptr [[ARR:%.+]], {{.+}} 0, {{.+}} 0 // CK23: [[ARRAY_DECAY:%.+]] = getelementptr inbounds [5 x [5 x float]], ptr [[ARRAY_IDX]], {{.+}} 0, {{.+}} 0 - // CK23: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds [5 x float], ptr [[ARRAY_DECAY]], {{.+}} + // CK23: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds nuw [5 x float], ptr [[ARRAY_DECAY]], {{.+}} // CK23: [[ARRAY_DECAY_2:%.+]] = getelementptr inbounds [5 x float], ptr [[ARRAY_IDX_1]], {{.+}} 0, {{.+}} 0 - // CK23: [[ARRAY_IDX_2:%.+]] = getelementptr inbounds float, ptr [[ARRAY_DECAY_2]], {{.+}} + // CK23: [[ARRAY_IDX_2:%.+]] = getelementptr inbounds nuw float, ptr [[ARRAY_DECAY_2]], {{.+}} // CK23: [[MUL:%.+]] = mul nuw i64 4, // CK23: [[BP0:%.+]] = getelementptr inbounds [1 x ptr], ptr [[BP:%.+]], {{.+}} 0, {{.+}} 0 // CK23: store ptr [[ARR]], ptr [[BP0]], @@ -1411,11 +1411,11 @@ void foo(int arg) { void foo(int arg) { double darr[3][4][5]; // CK24: [[DIMS:%.+]] = alloca [4 x [[STRUCT_DESCRIPTOR]]], - // CK24: [[ARRAY_IDX:%.+]] = getelementptr inbounds [3 x [4 x [5 x double]]], ptr [[ARR:%.+]], {{.+}} 0, {{.+}} 0 + // CK24: [[ARRAY_IDX:%.+]] = getelementptr inbounds nuw [3 x [4 x [5 x double]]], ptr [[ARR:%.+]], {{.+}} 0, {{.+}} 0 // CK24: [[ARRAY_DECAY:%.+]] = getelementptr inbounds [4 x [5 x double]], ptr [[ARRAY_IDX]], {{.+}} 0, {{.+}} 0 - // CK24: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds [5 x double], ptr [[ARRAY_DECAY]], {{.+}} + // CK24: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds nuw [5 x double], ptr [[ARRAY_DECAY]], {{.+}} // CK24: [[ARRAY_DECAY_2:%.+]] = getelementptr inbounds [5 x double], ptr [[ARRAY_IDX_1]], {{.+}} 0, {{.+}} 0 - // CK24: [[ARRAY_IDX_2:%.+]] = getelementptr inbounds double, ptr [[ARRAY_DECAY_2]], {{.+}} + // CK24: [[ARRAY_IDX_2:%.+]] = getelementptr inbounds nuw double, ptr [[ARRAY_DECAY_2]], {{.+}} // CK24: [[MUL:%.+]] = mul nuw i64 8, // CK24: [[SUB:%.+]] = sub nuw i64 4, [[ARG:%.+]] // CK24: [[LEN:%.+]] = udiv {{.+}} [[SUB]], 1 @@ -1488,15 +1488,15 @@ void foo(int arg) { // CK25: [[DIMS:%.+]] = alloca [4 x [[STRUCT_DESCRIPTOR]]], // CK25: [[DIMS_2:%.+]] = alloca [3 x [[STRUCT_DESCRIPTOR]]], - // CK25: [[ARRAY_IDX:%.+]] = getelementptr inbounds [3 x [4 x [5 x i32]]], ptr [[ARR:%.+]], {{.+}} 0, {{.+}} 0 + // CK25: [[ARRAY_IDX:%.+]] = getelementptr inbounds nuw [3 x [4 x [5 x i32]]], ptr [[ARR:%.+]], {{.+}} 0, {{.+}} 0 // CK25: [[ARRAY_DECAY:%.+]] = getelementptr inbounds [4 x [5 x i32]], ptr [[ARRAY_IDX]], {{.+}} 0, {{.+}} 0 - // CK25: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds [5 x i32], ptr [[ARRAY_DECAY]], {{.+}} + // CK25: [[ARRAY_IDX_1:%.+]] = getelementptr inbounds nuw [5 x i32], ptr [[ARRAY_DECAY]], {{.+}} // CK25: [[ARRAY_DECAY_2:%.+]] = getelementptr inbounds [5 x i32], ptr [[ARRAY_IDX_1]], {{.+}} 0, {{.+}} 0 - // CK25: [[ARRAY_IDX_3:%.+]] = getelementptr inbounds {{.+}}, ptr [[ARRAY_DECAY_2]], {{.+}} 1 + // CK25: [[ARRAY_IDX_3:%.+]] = getelementptr inbounds nuw {{.+}}, ptr [[ARRAY_DECAY_2]], {{.+}} 1 // CK25: [[LEN:%.+]] = sub nuw 
i64 4, [[ARG_ADDR:%.+]] - // CK25: [[ARRAY_IDX_4:%.+]] = getelementptr inbounds [4 x [3 x float]], ptr [[FARR:%.+]], {{.+}} 0, {{.+}} 0 + // CK25: [[ARRAY_IDX_4:%.+]] = getelementptr inbounds nuw [4 x [3 x float]], ptr [[FARR:%.+]], {{.+}} 0, {{.+}} 0 // CK25: [[ARRAY_DECAY_5:%.+]] = getelementptr inbounds [3 x float], ptr [[ARRAY_IDX_4]], {{.+}} 0, {{.+}} 0 - // CK25: [[ARRAY_IDX_6:%.+]] = getelementptr inbounds float, ptr [[ARRAY_DECAY_5:%.+]], {{.+}} 1 + // CK25: [[ARRAY_IDX_6:%.+]] = getelementptr inbounds nuw float, ptr [[ARRAY_DECAY_5:%.+]], {{.+}} 1 // CK25: [[BP0:%.+]] = getelementptr inbounds [3 x ptr], ptr [[BP:%.+]], i{{.+}} 0, i{{.+}} 0 // CK25: [[P0:%.+]] = getelementptr inbounds [3 x ptr], ptr [[P:%.+]], i{{.+}} 0, i{{.+}} 0 // CK25: [[DIM_1:%.+]] = getelementptr inbounds [4 x [[STRUCT_DESCRIPTOR]]], ptr [[DIMS]], {{.+}} 0, {{.+}} 0 diff --git a/clang/test/OpenMP/task_codegen.c b/clang/test/OpenMP/task_codegen.c index 0d10cbce4aa80..d08eb3762d5c9 100644 --- a/clang/test/OpenMP/task_codegen.c +++ b/clang/test/OpenMP/task_codegen.c @@ -183,7 +183,7 @@ for (int i = 0; i < 10; ++i) // CHECK: [[A:%.+]] = load ptr, ptr [[A_ADDR:%.+]], // CHECK: [[K:%.+]] = load i32, ptr [[K_ADDR]], // CHECK: [[IDX:%.+]] = zext i32 [[K]] to i64 - // CHECK: [[AK_ADDR:%.+]] = getelementptr inbounds ptr, ptr [[A]], i64 [[IDX]] + // CHECK: [[AK_ADDR:%.+]] = getelementptr inbounds nuw ptr, ptr [[A]], i64 [[IDX]] // CHECK: [[AK:%.+]] = load ptr, ptr [[AK_ADDR]], // CHECK: [[I:%.+]] = load i32, ptr [[I_ADDR]], // CHECK: [[IDX:%.+]] = sext i32 [[I]] to i64 diff --git a/clang/test/OpenMP/task_codegen.cpp b/clang/test/OpenMP/task_codegen.cpp index b256c41132ed3..c3e6d9e6b1cf7 100644 --- a/clang/test/OpenMP/task_codegen.cpp +++ b/clang/test/OpenMP/task_codegen.cpp @@ -309,9 +309,9 @@ void test_omp_all_memory() // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP30]], i32 0, i32 2 // CHECK1-NEXT: store i8 1, ptr [[TMP33]], align 8 // CHECK1-NEXT: [[TMP34:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP34]] +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP34]] // CHECK1-NEXT: [[TMP35:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP35]] +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP35]] // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[ARRAYIDX2]], i32 1 // CHECK1-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK1-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP36]] to i64 @@ -346,13 +346,13 @@ void test_omp_all_memory() // CHECK1-NEXT: [[TMP58:%.*]] = load i8, ptr [[B]], align 1 // CHECK1-NEXT: [[TMP59:%.*]] = sext i8 [[TMP58]] to i64 // CHECK1-NEXT: [[TMP60:%.*]] = mul nsw i64 4, [[TMP2]] -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP60]] -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX7]], i64 [[TMP59]] +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP60]] +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX7]], i64 [[TMP59]] // CHECK1-NEXT: [[TMP61:%.*]] = load i8, ptr [[B]], align 1 // CHECK1-NEXT: [[TMP62:%.*]] = sext i8 [[TMP61]] to i64 // CHECK1-NEXT: [[TMP63:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP63]] -// 
CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX9]], i64 [[TMP62]] +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP63]] +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX9]], i64 [[TMP62]] // CHECK1-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ARRAYIDX10]], i32 1 // CHECK1-NEXT: [[TMP65:%.*]] = ptrtoint ptr [[ARRAYIDX8]] to i64 // CHECK1-NEXT: [[TMP66:%.*]] = ptrtoint ptr [[TMP64]] to i64 @@ -384,13 +384,13 @@ void test_omp_all_memory() // CHECK1-NEXT: [[TMP83:%.*]] = load i8, ptr [[B]], align 1 // CHECK1-NEXT: [[TMP84:%.*]] = sext i8 [[TMP83]] to i64 // CHECK1-NEXT: [[TMP85:%.*]] = mul nsw i64 4, [[TMP2]] -// CHECK1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP85]] -// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX15]], i64 [[TMP84]] +// CHECK1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP85]] +// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX15]], i64 [[TMP84]] // CHECK1-NEXT: [[TMP86:%.*]] = load i8, ptr [[B]], align 1 // CHECK1-NEXT: [[TMP87:%.*]] = sext i8 [[TMP86]] to i64 // CHECK1-NEXT: [[TMP88:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP88]] -// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX17]], i64 [[TMP87]] +// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP88]] +// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX17]], i64 [[TMP87]] // CHECK1-NEXT: [[TMP89:%.*]] = getelementptr i32, ptr [[ARRAYIDX18]], i32 1 // CHECK1-NEXT: [[TMP90:%.*]] = ptrtoint ptr [[ARRAYIDX16]] to i64 // CHECK1-NEXT: [[TMP91:%.*]] = ptrtoint ptr [[TMP89]] to i64 @@ -427,8 +427,8 @@ void test_omp_all_memory() // CHECK1-NEXT: [[TMP111:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP108]], i32 0, i32 2 // CHECK1-NEXT: store i8 3, ptr [[TMP111]], align 8 // CHECK1-NEXT: [[TMP112:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK1-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP112]] -// CHECK1-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX23]], i64 3 +// CHECK1-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP112]] +// CHECK1-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX23]], i64 3 // CHECK1-NEXT: [[TMP113:%.*]] = load i32, ptr @a, align 4 // CHECK1-NEXT: [[TMP114:%.*]] = sext i32 [[TMP113]] to i64 // CHECK1-NEXT: [[LEN_SUB_1:%.*]] = sub nsw i64 [[TMP114]], 1 @@ -436,8 +436,8 @@ void test_omp_all_memory() // CHECK1-NEXT: [[TMP116:%.*]] = sext i32 [[TMP115]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP116]] // CHECK1-NEXT: [[TMP117:%.*]] = mul nsw i64 [[LB_ADD_LEN]], [[TMP2]] -// CHECK1-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP117]] -// CHECK1-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX25]], i64 [[LEN_SUB_1]] +// CHECK1-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP117]] +// CHECK1-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX25]], i64 [[LEN_SUB_1]] // CHECK1-NEXT: [[TMP118:%.*]] = getelementptr i32, ptr [[ARRAYIDX26]], i32 1 // CHECK1-NEXT: [[TMP119:%.*]] = ptrtoint ptr [[ARRAYIDX24]] to i64 // CHECK1-NEXT: [[TMP120:%.*]] = ptrtoint 
ptr [[TMP118]] to i64 @@ -1432,9 +1432,9 @@ void test_omp_all_memory() // CHECK1-51-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP30]], i32 0, i32 2 // CHECK1-51-NEXT: store i8 1, ptr [[TMP33]], align 8 // CHECK1-51-NEXT: [[TMP34:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP34]] +// CHECK1-51-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP34]] // CHECK1-51-NEXT: [[TMP35:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP35]] +// CHECK1-51-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP35]] // CHECK1-51-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[ARRAYIDX2]], i32 1 // CHECK1-51-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK1-51-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP36]] to i64 @@ -1469,13 +1469,13 @@ void test_omp_all_memory() // CHECK1-51-NEXT: [[TMP58:%.*]] = load i8, ptr [[B]], align 1 // CHECK1-51-NEXT: [[TMP59:%.*]] = sext i8 [[TMP58]] to i64 // CHECK1-51-NEXT: [[TMP60:%.*]] = mul nsw i64 4, [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP60]] -// CHECK1-51-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX7]], i64 [[TMP59]] +// CHECK1-51-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP60]] +// CHECK1-51-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX7]], i64 [[TMP59]] // CHECK1-51-NEXT: [[TMP61:%.*]] = load i8, ptr [[B]], align 1 // CHECK1-51-NEXT: [[TMP62:%.*]] = sext i8 [[TMP61]] to i64 // CHECK1-51-NEXT: [[TMP63:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP63]] -// CHECK1-51-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX9]], i64 [[TMP62]] +// CHECK1-51-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP63]] +// CHECK1-51-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX9]], i64 [[TMP62]] // CHECK1-51-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ARRAYIDX10]], i32 1 // CHECK1-51-NEXT: [[TMP65:%.*]] = ptrtoint ptr [[ARRAYIDX8]] to i64 // CHECK1-51-NEXT: [[TMP66:%.*]] = ptrtoint ptr [[TMP64]] to i64 @@ -1507,13 +1507,13 @@ void test_omp_all_memory() // CHECK1-51-NEXT: [[TMP83:%.*]] = load i8, ptr [[B]], align 1 // CHECK1-51-NEXT: [[TMP84:%.*]] = sext i8 [[TMP83]] to i64 // CHECK1-51-NEXT: [[TMP85:%.*]] = mul nsw i64 4, [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP85]] -// CHECK1-51-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX15]], i64 [[TMP84]] +// CHECK1-51-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP85]] +// CHECK1-51-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX15]], i64 [[TMP84]] // CHECK1-51-NEXT: [[TMP86:%.*]] = load i8, ptr [[B]], align 1 // CHECK1-51-NEXT: [[TMP87:%.*]] = sext i8 [[TMP86]] to i64 // CHECK1-51-NEXT: [[TMP88:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP88]] -// CHECK1-51-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX17]], i64 [[TMP87]] +// CHECK1-51-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP88]] +// CHECK1-51-NEXT: [[ARRAYIDX18:%.*]] = 
getelementptr inbounds nuw i32, ptr [[ARRAYIDX17]], i64 [[TMP87]] // CHECK1-51-NEXT: [[TMP89:%.*]] = getelementptr i32, ptr [[ARRAYIDX18]], i32 1 // CHECK1-51-NEXT: [[TMP90:%.*]] = ptrtoint ptr [[ARRAYIDX16]] to i64 // CHECK1-51-NEXT: [[TMP91:%.*]] = ptrtoint ptr [[TMP89]] to i64 @@ -1550,8 +1550,8 @@ void test_omp_all_memory() // CHECK1-51-NEXT: [[TMP111:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP108]], i32 0, i32 2 // CHECK1-51-NEXT: store i8 3, ptr [[TMP111]], align 8 // CHECK1-51-NEXT: [[TMP112:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP112]] -// CHECK1-51-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX23]], i64 3 +// CHECK1-51-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP112]] +// CHECK1-51-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX23]], i64 3 // CHECK1-51-NEXT: [[TMP113:%.*]] = load i32, ptr @a, align 4 // CHECK1-51-NEXT: [[TMP114:%.*]] = sext i32 [[TMP113]] to i64 // CHECK1-51-NEXT: [[LEN_SUB_1:%.*]] = sub nsw i64 [[TMP114]], 1 @@ -1559,8 +1559,8 @@ void test_omp_all_memory() // CHECK1-51-NEXT: [[TMP116:%.*]] = sext i32 [[TMP115]] to i64 // CHECK1-51-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP116]] // CHECK1-51-NEXT: [[TMP117:%.*]] = mul nsw i64 [[LB_ADD_LEN]], [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP117]] -// CHECK1-51-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX25]], i64 [[LEN_SUB_1]] +// CHECK1-51-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP117]] +// CHECK1-51-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX25]], i64 [[LEN_SUB_1]] // CHECK1-51-NEXT: [[TMP118:%.*]] = getelementptr i32, ptr [[ARRAYIDX26]], i32 1 // CHECK1-51-NEXT: [[TMP119:%.*]] = ptrtoint ptr [[ARRAYIDX24]] to i64 // CHECK1-51-NEXT: [[TMP120:%.*]] = ptrtoint ptr [[TMP118]] to i64 @@ -1595,8 +1595,8 @@ void test_omp_all_memory() // CHECK1-51-NEXT: [[TMP139:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP136]], i32 0, i32 2 // CHECK1-51-NEXT: store i8 8, ptr [[TMP139]], align 8 // CHECK1-51-NEXT: [[TMP140:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP140]] -// CHECK1-51-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX31]], i64 3 +// CHECK1-51-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP140]] +// CHECK1-51-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX31]], i64 3 // CHECK1-51-NEXT: [[TMP141:%.*]] = load i32, ptr @a, align 4 // CHECK1-51-NEXT: [[TMP142:%.*]] = sext i32 [[TMP141]] to i64 // CHECK1-51-NEXT: [[LEN_SUB_133:%.*]] = sub nsw i64 [[TMP142]], 1 @@ -1604,8 +1604,8 @@ void test_omp_all_memory() // CHECK1-51-NEXT: [[TMP144:%.*]] = sext i32 [[TMP143]] to i64 // CHECK1-51-NEXT: [[LB_ADD_LEN34:%.*]] = add nsw i64 -1, [[TMP144]] // CHECK1-51-NEXT: [[TMP145:%.*]] = mul nsw i64 [[LB_ADD_LEN34]], [[TMP2]] -// CHECK1-51-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP145]] -// CHECK1-51-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_133]] +// CHECK1-51-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP145]] +// CHECK1-51-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds nuw i32, ptr 
[[ARRAYIDX35]], i64 [[LEN_SUB_133]] // CHECK1-51-NEXT: [[TMP146:%.*]] = getelementptr i32, ptr [[ARRAYIDX36]], i32 1 // CHECK1-51-NEXT: [[TMP147:%.*]] = ptrtoint ptr [[ARRAYIDX32]] to i64 // CHECK1-51-NEXT: [[TMP148:%.*]] = ptrtoint ptr [[TMP146]] to i64 @@ -3040,9 +3040,9 @@ void test_omp_all_memory() // CHECK2-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP30]], i32 0, i32 2 // CHECK2-NEXT: store i8 1, ptr [[TMP33]], align 8 // CHECK2-NEXT: [[TMP34:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP34]] +// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP34]] // CHECK2-NEXT: [[TMP35:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP35]] +// CHECK2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP35]] // CHECK2-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[ARRAYIDX2]], i32 1 // CHECK2-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK2-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP36]] to i64 @@ -3077,13 +3077,13 @@ void test_omp_all_memory() // CHECK2-NEXT: [[TMP58:%.*]] = load i8, ptr [[B]], align 1 // CHECK2-NEXT: [[TMP59:%.*]] = sext i8 [[TMP58]] to i64 // CHECK2-NEXT: [[TMP60:%.*]] = mul nsw i64 4, [[TMP2]] -// CHECK2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP60]] -// CHECK2-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX7]], i64 [[TMP59]] +// CHECK2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP60]] +// CHECK2-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX7]], i64 [[TMP59]] // CHECK2-NEXT: [[TMP61:%.*]] = load i8, ptr [[B]], align 1 // CHECK2-NEXT: [[TMP62:%.*]] = sext i8 [[TMP61]] to i64 // CHECK2-NEXT: [[TMP63:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK2-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP63]] -// CHECK2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX9]], i64 [[TMP62]] +// CHECK2-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP63]] +// CHECK2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX9]], i64 [[TMP62]] // CHECK2-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ARRAYIDX10]], i32 1 // CHECK2-NEXT: [[TMP65:%.*]] = ptrtoint ptr [[ARRAYIDX8]] to i64 // CHECK2-NEXT: [[TMP66:%.*]] = ptrtoint ptr [[TMP64]] to i64 @@ -3115,13 +3115,13 @@ void test_omp_all_memory() // CHECK2-NEXT: [[TMP83:%.*]] = load i8, ptr [[B]], align 1 // CHECK2-NEXT: [[TMP84:%.*]] = sext i8 [[TMP83]] to i64 // CHECK2-NEXT: [[TMP85:%.*]] = mul nsw i64 4, [[TMP2]] -// CHECK2-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP85]] -// CHECK2-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX15]], i64 [[TMP84]] +// CHECK2-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP85]] +// CHECK2-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX15]], i64 [[TMP84]] // CHECK2-NEXT: [[TMP86:%.*]] = load i8, ptr [[B]], align 1 // CHECK2-NEXT: [[TMP87:%.*]] = sext i8 [[TMP86]] to i64 // CHECK2-NEXT: [[TMP88:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK2-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP88]] -// CHECK2-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX17]], i64 [[TMP87]] +// CHECK2-NEXT: 
[[ARRAYIDX17:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP88]] +// CHECK2-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX17]], i64 [[TMP87]] // CHECK2-NEXT: [[TMP89:%.*]] = getelementptr i32, ptr [[ARRAYIDX18]], i32 1 // CHECK2-NEXT: [[TMP90:%.*]] = ptrtoint ptr [[ARRAYIDX16]] to i64 // CHECK2-NEXT: [[TMP91:%.*]] = ptrtoint ptr [[TMP89]] to i64 @@ -3158,8 +3158,8 @@ void test_omp_all_memory() // CHECK2-NEXT: [[TMP111:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP108]], i32 0, i32 2 // CHECK2-NEXT: store i8 3, ptr [[TMP111]], align 8 // CHECK2-NEXT: [[TMP112:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK2-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP112]] -// CHECK2-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX23]], i64 3 +// CHECK2-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP112]] +// CHECK2-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX23]], i64 3 // CHECK2-NEXT: [[TMP113:%.*]] = load i32, ptr @a, align 4 // CHECK2-NEXT: [[TMP114:%.*]] = sext i32 [[TMP113]] to i64 // CHECK2-NEXT: [[LEN_SUB_1:%.*]] = sub nsw i64 [[TMP114]], 1 @@ -3167,8 +3167,8 @@ void test_omp_all_memory() // CHECK2-NEXT: [[TMP116:%.*]] = sext i32 [[TMP115]] to i64 // CHECK2-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP116]] // CHECK2-NEXT: [[TMP117:%.*]] = mul nsw i64 [[LB_ADD_LEN]], [[TMP2]] -// CHECK2-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP117]] -// CHECK2-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX25]], i64 [[LEN_SUB_1]] +// CHECK2-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP117]] +// CHECK2-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX25]], i64 [[LEN_SUB_1]] // CHECK2-NEXT: [[TMP118:%.*]] = getelementptr i32, ptr [[ARRAYIDX26]], i32 1 // CHECK2-NEXT: [[TMP119:%.*]] = ptrtoint ptr [[ARRAYIDX24]] to i64 // CHECK2-NEXT: [[TMP120:%.*]] = ptrtoint ptr [[TMP118]] to i64 @@ -4163,9 +4163,9 @@ void test_omp_all_memory() // CHECK2-51-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP30]], i32 0, i32 2 // CHECK2-51-NEXT: store i8 1, ptr [[TMP33]], align 8 // CHECK2-51-NEXT: [[TMP34:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP34]] +// CHECK2-51-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP34]] // CHECK2-51-NEXT: [[TMP35:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP35]] +// CHECK2-51-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP35]] // CHECK2-51-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[ARRAYIDX2]], i32 1 // CHECK2-51-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK2-51-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP36]] to i64 @@ -4200,13 +4200,13 @@ void test_omp_all_memory() // CHECK2-51-NEXT: [[TMP58:%.*]] = load i8, ptr [[B]], align 1 // CHECK2-51-NEXT: [[TMP59:%.*]] = sext i8 [[TMP58]] to i64 // CHECK2-51-NEXT: [[TMP60:%.*]] = mul nsw i64 4, [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP60]] -// CHECK2-51-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX7]], i64 [[TMP59]] +// CHECK2-51-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP60]] +// 
CHECK2-51-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX7]], i64 [[TMP59]] // CHECK2-51-NEXT: [[TMP61:%.*]] = load i8, ptr [[B]], align 1 // CHECK2-51-NEXT: [[TMP62:%.*]] = sext i8 [[TMP61]] to i64 // CHECK2-51-NEXT: [[TMP63:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP63]] -// CHECK2-51-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX9]], i64 [[TMP62]] +// CHECK2-51-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP63]] +// CHECK2-51-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX9]], i64 [[TMP62]] // CHECK2-51-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ARRAYIDX10]], i32 1 // CHECK2-51-NEXT: [[TMP65:%.*]] = ptrtoint ptr [[ARRAYIDX8]] to i64 // CHECK2-51-NEXT: [[TMP66:%.*]] = ptrtoint ptr [[TMP64]] to i64 @@ -4238,13 +4238,13 @@ void test_omp_all_memory() // CHECK2-51-NEXT: [[TMP83:%.*]] = load i8, ptr [[B]], align 1 // CHECK2-51-NEXT: [[TMP84:%.*]] = sext i8 [[TMP83]] to i64 // CHECK2-51-NEXT: [[TMP85:%.*]] = mul nsw i64 4, [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP85]] -// CHECK2-51-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX15]], i64 [[TMP84]] +// CHECK2-51-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP85]] +// CHECK2-51-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX15]], i64 [[TMP84]] // CHECK2-51-NEXT: [[TMP86:%.*]] = load i8, ptr [[B]], align 1 // CHECK2-51-NEXT: [[TMP87:%.*]] = sext i8 [[TMP86]] to i64 // CHECK2-51-NEXT: [[TMP88:%.*]] = mul nsw i64 9, [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP88]] -// CHECK2-51-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX17]], i64 [[TMP87]] +// CHECK2-51-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP88]] +// CHECK2-51-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX17]], i64 [[TMP87]] // CHECK2-51-NEXT: [[TMP89:%.*]] = getelementptr i32, ptr [[ARRAYIDX18]], i32 1 // CHECK2-51-NEXT: [[TMP90:%.*]] = ptrtoint ptr [[ARRAYIDX16]] to i64 // CHECK2-51-NEXT: [[TMP91:%.*]] = ptrtoint ptr [[TMP89]] to i64 @@ -4281,8 +4281,8 @@ void test_omp_all_memory() // CHECK2-51-NEXT: [[TMP111:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP108]], i32 0, i32 2 // CHECK2-51-NEXT: store i8 3, ptr [[TMP111]], align 8 // CHECK2-51-NEXT: [[TMP112:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP112]] -// CHECK2-51-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX23]], i64 3 +// CHECK2-51-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP112]] +// CHECK2-51-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX23]], i64 3 // CHECK2-51-NEXT: [[TMP113:%.*]] = load i32, ptr @a, align 4 // CHECK2-51-NEXT: [[TMP114:%.*]] = sext i32 [[TMP113]] to i64 // CHECK2-51-NEXT: [[LEN_SUB_1:%.*]] = sub nsw i64 [[TMP114]], 1 @@ -4290,8 +4290,8 @@ void test_omp_all_memory() // CHECK2-51-NEXT: [[TMP116:%.*]] = sext i32 [[TMP115]] to i64 // CHECK2-51-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP116]] // CHECK2-51-NEXT: [[TMP117:%.*]] = mul nsw i64 [[LB_ADD_LEN]], [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 
[[TMP117]] -// CHECK2-51-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX25]], i64 [[LEN_SUB_1]] +// CHECK2-51-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP117]] +// CHECK2-51-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX25]], i64 [[LEN_SUB_1]] // CHECK2-51-NEXT: [[TMP118:%.*]] = getelementptr i32, ptr [[ARRAYIDX26]], i32 1 // CHECK2-51-NEXT: [[TMP119:%.*]] = ptrtoint ptr [[ARRAYIDX24]] to i64 // CHECK2-51-NEXT: [[TMP120:%.*]] = ptrtoint ptr [[TMP118]] to i64 @@ -4326,8 +4326,8 @@ void test_omp_all_memory() // CHECK2-51-NEXT: [[TMP139:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP136]], i32 0, i32 2 // CHECK2-51-NEXT: store i8 8, ptr [[TMP139]], align 8 // CHECK2-51-NEXT: [[TMP140:%.*]] = mul nsw i64 0, [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP140]] -// CHECK2-51-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX31]], i64 3 +// CHECK2-51-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP140]] +// CHECK2-51-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX31]], i64 3 // CHECK2-51-NEXT: [[TMP141:%.*]] = load i32, ptr @a, align 4 // CHECK2-51-NEXT: [[TMP142:%.*]] = sext i32 [[TMP141]] to i64 // CHECK2-51-NEXT: [[LEN_SUB_133:%.*]] = sub nsw i64 [[TMP142]], 1 @@ -4335,8 +4335,8 @@ void test_omp_all_memory() // CHECK2-51-NEXT: [[TMP144:%.*]] = sext i32 [[TMP143]] to i64 // CHECK2-51-NEXT: [[LB_ADD_LEN34:%.*]] = add nsw i64 -1, [[TMP144]] // CHECK2-51-NEXT: [[TMP145:%.*]] = mul nsw i64 [[LB_ADD_LEN34]], [[TMP2]] -// CHECK2-51-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP145]] -// CHECK2-51-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_133]] +// CHECK2-51-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP145]] +// CHECK2-51-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_133]] // CHECK2-51-NEXT: [[TMP146:%.*]] = getelementptr i32, ptr [[ARRAYIDX36]], i32 1 // CHECK2-51-NEXT: [[TMP147:%.*]] = ptrtoint ptr [[ARRAYIDX32]] to i64 // CHECK2-51-NEXT: [[TMP148:%.*]] = ptrtoint ptr [[TMP146]] to i64 @@ -5773,9 +5773,9 @@ void test_omp_all_memory() // CHECK3-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP29]], i32 0, i32 2 // CHECK3-NEXT: store i8 1, ptr [[TMP32]], align 8 // CHECK3-NEXT: [[TMP33:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP33]] +// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP33]] // CHECK3-NEXT: [[TMP34:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK3-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP34]] +// CHECK3-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP34]] // CHECK3-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[ARRAYIDX4]], i32 1 // CHECK3-NEXT: [[TMP36:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK3-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[TMP35]] to i64 @@ -5814,13 +5814,13 @@ void test_omp_all_memory() // CHECK3-NEXT: [[TMP57:%.*]] = load i8, ptr [[B]], align 1 // CHECK3-NEXT: [[TMP58:%.*]] = sext i8 [[TMP57]] to i64 // CHECK3-NEXT: [[TMP59:%.*]] = mul nsw i64 4, [[TMP1]] -// CHECK3-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP59]] -// CHECK3-NEXT: 
[[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX13]], i64 [[TMP58]] +// CHECK3-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP59]] +// CHECK3-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX13]], i64 [[TMP58]] // CHECK3-NEXT: [[TMP60:%.*]] = load i8, ptr [[B]], align 1 // CHECK3-NEXT: [[TMP61:%.*]] = sext i8 [[TMP60]] to i64 // CHECK3-NEXT: [[TMP62:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK3-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP62]] -// CHECK3-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX15]], i64 [[TMP61]] +// CHECK3-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP62]] +// CHECK3-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX15]], i64 [[TMP61]] // CHECK3-NEXT: [[TMP63:%.*]] = getelementptr i32, ptr [[ARRAYIDX16]], i32 1 // CHECK3-NEXT: [[TMP64:%.*]] = ptrtoint ptr [[ARRAYIDX14]] to i64 // CHECK3-NEXT: [[TMP65:%.*]] = ptrtoint ptr [[TMP63]] to i64 @@ -5854,13 +5854,13 @@ void test_omp_all_memory() // CHECK3-NEXT: [[TMP82:%.*]] = load i8, ptr [[B]], align 1 // CHECK3-NEXT: [[TMP83:%.*]] = sext i8 [[TMP82]] to i64 // CHECK3-NEXT: [[TMP84:%.*]] = mul nsw i64 4, [[TMP1]] -// CHECK3-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP84]] -// CHECK3-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX23]], i64 [[TMP83]] +// CHECK3-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP84]] +// CHECK3-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX23]], i64 [[TMP83]] // CHECK3-NEXT: [[TMP85:%.*]] = load i8, ptr [[B]], align 1 // CHECK3-NEXT: [[TMP86:%.*]] = sext i8 [[TMP85]] to i64 // CHECK3-NEXT: [[TMP87:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK3-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP87]] -// CHECK3-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX25]], i64 [[TMP86]] +// CHECK3-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP87]] +// CHECK3-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX25]], i64 [[TMP86]] // CHECK3-NEXT: [[TMP88:%.*]] = getelementptr i32, ptr [[ARRAYIDX26]], i32 1 // CHECK3-NEXT: [[TMP89:%.*]] = ptrtoint ptr [[ARRAYIDX24]] to i64 // CHECK3-NEXT: [[TMP90:%.*]] = ptrtoint ptr [[TMP88]] to i64 @@ -5899,8 +5899,8 @@ void test_omp_all_memory() // CHECK3-NEXT: [[TMP110:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP107]], i32 0, i32 2 // CHECK3-NEXT: store i8 3, ptr [[TMP110]], align 8 // CHECK3-NEXT: [[TMP111:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK3-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP111]] -// CHECK3-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX33]], i64 3 +// CHECK3-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP111]] +// CHECK3-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX33]], i64 3 // CHECK3-NEXT: [[TMP112:%.*]] = load i32, ptr @a, align 4 // CHECK3-NEXT: [[TMP113:%.*]] = sext i32 [[TMP112]] to i64 // CHECK3-NEXT: [[LEN_SUB_1:%.*]] = sub nsw i64 [[TMP113]], 1 @@ -5908,8 +5908,8 @@ void test_omp_all_memory() // CHECK3-NEXT: [[TMP115:%.*]] = sext i32 [[TMP114]] to i64 // CHECK3-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP115]] // CHECK3-NEXT: [[TMP116:%.*]] = mul nsw i64 [[LB_ADD_LEN]], [[TMP1]] -// CHECK3-NEXT: 
[[ARRAYIDX35:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP116]] -// CHECK3-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_1]] +// CHECK3-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP116]] +// CHECK3-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_1]] // CHECK3-NEXT: [[TMP117:%.*]] = getelementptr i32, ptr [[ARRAYIDX36]], i32 1 // CHECK3-NEXT: [[TMP118:%.*]] = ptrtoint ptr [[ARRAYIDX34]] to i64 // CHECK3-NEXT: [[TMP119:%.*]] = ptrtoint ptr [[TMP117]] to i64 @@ -6789,9 +6789,9 @@ void test_omp_all_memory() // CHECK4-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP29]], i32 0, i32 2 // CHECK4-NEXT: store i8 1, ptr [[TMP32]], align 8 // CHECK4-NEXT: [[TMP33:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP33]] +// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP33]] // CHECK4-NEXT: [[TMP34:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK4-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP34]] +// CHECK4-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP34]] // CHECK4-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[ARRAYIDX4]], i32 1 // CHECK4-NEXT: [[TMP36:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK4-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[TMP35]] to i64 @@ -6830,13 +6830,13 @@ void test_omp_all_memory() // CHECK4-NEXT: [[TMP57:%.*]] = load i8, ptr [[B]], align 1 // CHECK4-NEXT: [[TMP58:%.*]] = sext i8 [[TMP57]] to i64 // CHECK4-NEXT: [[TMP59:%.*]] = mul nsw i64 4, [[TMP1]] -// CHECK4-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP59]] -// CHECK4-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX13]], i64 [[TMP58]] +// CHECK4-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP59]] +// CHECK4-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX13]], i64 [[TMP58]] // CHECK4-NEXT: [[TMP60:%.*]] = load i8, ptr [[B]], align 1 // CHECK4-NEXT: [[TMP61:%.*]] = sext i8 [[TMP60]] to i64 // CHECK4-NEXT: [[TMP62:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK4-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP62]] -// CHECK4-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX15]], i64 [[TMP61]] +// CHECK4-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP62]] +// CHECK4-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX15]], i64 [[TMP61]] // CHECK4-NEXT: [[TMP63:%.*]] = getelementptr i32, ptr [[ARRAYIDX16]], i32 1 // CHECK4-NEXT: [[TMP64:%.*]] = ptrtoint ptr [[ARRAYIDX14]] to i64 // CHECK4-NEXT: [[TMP65:%.*]] = ptrtoint ptr [[TMP63]] to i64 @@ -6870,13 +6870,13 @@ void test_omp_all_memory() // CHECK4-NEXT: [[TMP82:%.*]] = load i8, ptr [[B]], align 1 // CHECK4-NEXT: [[TMP83:%.*]] = sext i8 [[TMP82]] to i64 // CHECK4-NEXT: [[TMP84:%.*]] = mul nsw i64 4, [[TMP1]] -// CHECK4-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP84]] -// CHECK4-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX23]], i64 [[TMP83]] +// CHECK4-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP84]] +// CHECK4-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX23]], i64 [[TMP83]] // CHECK4-NEXT: [[TMP85:%.*]] = load i8, ptr 
[[B]], align 1 // CHECK4-NEXT: [[TMP86:%.*]] = sext i8 [[TMP85]] to i64 // CHECK4-NEXT: [[TMP87:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK4-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP87]] -// CHECK4-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX25]], i64 [[TMP86]] +// CHECK4-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP87]] +// CHECK4-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX25]], i64 [[TMP86]] // CHECK4-NEXT: [[TMP88:%.*]] = getelementptr i32, ptr [[ARRAYIDX26]], i32 1 // CHECK4-NEXT: [[TMP89:%.*]] = ptrtoint ptr [[ARRAYIDX24]] to i64 // CHECK4-NEXT: [[TMP90:%.*]] = ptrtoint ptr [[TMP88]] to i64 @@ -6915,8 +6915,8 @@ void test_omp_all_memory() // CHECK4-NEXT: [[TMP110:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP107]], i32 0, i32 2 // CHECK4-NEXT: store i8 3, ptr [[TMP110]], align 8 // CHECK4-NEXT: [[TMP111:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK4-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP111]] -// CHECK4-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX33]], i64 3 +// CHECK4-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP111]] +// CHECK4-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX33]], i64 3 // CHECK4-NEXT: [[TMP112:%.*]] = load i32, ptr @a, align 4 // CHECK4-NEXT: [[TMP113:%.*]] = sext i32 [[TMP112]] to i64 // CHECK4-NEXT: [[LEN_SUB_1:%.*]] = sub nsw i64 [[TMP113]], 1 @@ -6924,8 +6924,8 @@ void test_omp_all_memory() // CHECK4-NEXT: [[TMP115:%.*]] = sext i32 [[TMP114]] to i64 // CHECK4-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP115]] // CHECK4-NEXT: [[TMP116:%.*]] = mul nsw i64 [[LB_ADD_LEN]], [[TMP1]] -// CHECK4-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP116]] -// CHECK4-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_1]] +// CHECK4-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP116]] +// CHECK4-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_1]] // CHECK4-NEXT: [[TMP117:%.*]] = getelementptr i32, ptr [[ARRAYIDX36]], i32 1 // CHECK4-NEXT: [[TMP118:%.*]] = ptrtoint ptr [[ARRAYIDX34]] to i64 // CHECK4-NEXT: [[TMP119:%.*]] = ptrtoint ptr [[TMP117]] to i64 @@ -7808,9 +7808,9 @@ void test_omp_all_memory() // CHECK3-51-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP29]], i32 0, i32 2 // CHECK3-51-NEXT: store i8 1, ptr [[TMP32]], align 8 // CHECK3-51-NEXT: [[TMP33:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP33]] +// CHECK3-51-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP33]] // CHECK3-51-NEXT: [[TMP34:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP34]] +// CHECK3-51-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP34]] // CHECK3-51-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[ARRAYIDX4]], i32 1 // CHECK3-51-NEXT: [[TMP36:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK3-51-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[TMP35]] to i64 @@ -7849,13 +7849,13 @@ void test_omp_all_memory() // CHECK3-51-NEXT: [[TMP57:%.*]] = load i8, ptr [[B]], align 1 // CHECK3-51-NEXT: [[TMP58:%.*]] = sext i8 [[TMP57]] to i64 // CHECK3-51-NEXT: 
[[TMP59:%.*]] = mul nsw i64 4, [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP59]] -// CHECK3-51-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX13]], i64 [[TMP58]] +// CHECK3-51-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP59]] +// CHECK3-51-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX13]], i64 [[TMP58]] // CHECK3-51-NEXT: [[TMP60:%.*]] = load i8, ptr [[B]], align 1 // CHECK3-51-NEXT: [[TMP61:%.*]] = sext i8 [[TMP60]] to i64 // CHECK3-51-NEXT: [[TMP62:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP62]] -// CHECK3-51-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX15]], i64 [[TMP61]] +// CHECK3-51-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP62]] +// CHECK3-51-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX15]], i64 [[TMP61]] // CHECK3-51-NEXT: [[TMP63:%.*]] = getelementptr i32, ptr [[ARRAYIDX16]], i32 1 // CHECK3-51-NEXT: [[TMP64:%.*]] = ptrtoint ptr [[ARRAYIDX14]] to i64 // CHECK3-51-NEXT: [[TMP65:%.*]] = ptrtoint ptr [[TMP63]] to i64 @@ -7889,13 +7889,13 @@ void test_omp_all_memory() // CHECK3-51-NEXT: [[TMP82:%.*]] = load i8, ptr [[B]], align 1 // CHECK3-51-NEXT: [[TMP83:%.*]] = sext i8 [[TMP82]] to i64 // CHECK3-51-NEXT: [[TMP84:%.*]] = mul nsw i64 4, [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP84]] -// CHECK3-51-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX23]], i64 [[TMP83]] +// CHECK3-51-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP84]] +// CHECK3-51-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX23]], i64 [[TMP83]] // CHECK3-51-NEXT: [[TMP85:%.*]] = load i8, ptr [[B]], align 1 // CHECK3-51-NEXT: [[TMP86:%.*]] = sext i8 [[TMP85]] to i64 // CHECK3-51-NEXT: [[TMP87:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP87]] -// CHECK3-51-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX25]], i64 [[TMP86]] +// CHECK3-51-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP87]] +// CHECK3-51-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX25]], i64 [[TMP86]] // CHECK3-51-NEXT: [[TMP88:%.*]] = getelementptr i32, ptr [[ARRAYIDX26]], i32 1 // CHECK3-51-NEXT: [[TMP89:%.*]] = ptrtoint ptr [[ARRAYIDX24]] to i64 // CHECK3-51-NEXT: [[TMP90:%.*]] = ptrtoint ptr [[TMP88]] to i64 @@ -7934,8 +7934,8 @@ void test_omp_all_memory() // CHECK3-51-NEXT: [[TMP110:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP107]], i32 0, i32 2 // CHECK3-51-NEXT: store i8 3, ptr [[TMP110]], align 8 // CHECK3-51-NEXT: [[TMP111:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP111]] -// CHECK3-51-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX33]], i64 3 +// CHECK3-51-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP111]] +// CHECK3-51-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX33]], i64 3 // CHECK3-51-NEXT: [[TMP112:%.*]] = load i32, ptr @a, align 4 // CHECK3-51-NEXT: [[TMP113:%.*]] = sext i32 [[TMP112]] to i64 // CHECK3-51-NEXT: [[LEN_SUB_1:%.*]] = sub nsw i64 [[TMP113]], 
1 @@ -7943,8 +7943,8 @@ void test_omp_all_memory() // CHECK3-51-NEXT: [[TMP115:%.*]] = sext i32 [[TMP114]] to i64 // CHECK3-51-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP115]] // CHECK3-51-NEXT: [[TMP116:%.*]] = mul nsw i64 [[LB_ADD_LEN]], [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP116]] -// CHECK3-51-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_1]] +// CHECK3-51-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP116]] +// CHECK3-51-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_1]] // CHECK3-51-NEXT: [[TMP117:%.*]] = getelementptr i32, ptr [[ARRAYIDX36]], i32 1 // CHECK3-51-NEXT: [[TMP118:%.*]] = ptrtoint ptr [[ARRAYIDX34]] to i64 // CHECK3-51-NEXT: [[TMP119:%.*]] = ptrtoint ptr [[TMP117]] to i64 @@ -7981,8 +7981,8 @@ void test_omp_all_memory() // CHECK3-51-NEXT: [[TMP138:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP135]], i32 0, i32 2 // CHECK3-51-NEXT: store i8 8, ptr [[TMP138]], align 8 // CHECK3-51-NEXT: [[TMP139:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX43:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP139]] -// CHECK3-51-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX43]], i64 3 +// CHECK3-51-NEXT: [[ARRAYIDX43:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP139]] +// CHECK3-51-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX43]], i64 3 // CHECK3-51-NEXT: [[TMP140:%.*]] = load i32, ptr @a, align 4 // CHECK3-51-NEXT: [[TMP141:%.*]] = sext i32 [[TMP140]] to i64 // CHECK3-51-NEXT: [[LEN_SUB_145:%.*]] = sub nsw i64 [[TMP141]], 1 @@ -7990,8 +7990,8 @@ void test_omp_all_memory() // CHECK3-51-NEXT: [[TMP143:%.*]] = sext i32 [[TMP142]] to i64 // CHECK3-51-NEXT: [[LB_ADD_LEN46:%.*]] = add nsw i64 -1, [[TMP143]] // CHECK3-51-NEXT: [[TMP144:%.*]] = mul nsw i64 [[LB_ADD_LEN46]], [[TMP1]] -// CHECK3-51-NEXT: [[ARRAYIDX47:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP144]] -// CHECK3-51-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX47]], i64 [[LEN_SUB_145]] +// CHECK3-51-NEXT: [[ARRAYIDX47:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP144]] +// CHECK3-51-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX47]], i64 [[LEN_SUB_145]] // CHECK3-51-NEXT: [[TMP145:%.*]] = getelementptr i32, ptr [[ARRAYIDX48]], i32 1 // CHECK3-51-NEXT: [[TMP146:%.*]] = ptrtoint ptr [[ARRAYIDX44]] to i64 // CHECK3-51-NEXT: [[TMP147:%.*]] = ptrtoint ptr [[TMP145]] to i64 @@ -9323,9 +9323,9 @@ void test_omp_all_memory() // CHECK4-51-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP29]], i32 0, i32 2 // CHECK4-51-NEXT: store i8 1, ptr [[TMP32]], align 8 // CHECK4-51-NEXT: [[TMP33:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK4-51-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP33]] +// CHECK4-51-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP33]] // CHECK4-51-NEXT: [[TMP34:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK4-51-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP34]] +// CHECK4-51-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP34]] // CHECK4-51-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[ARRAYIDX4]], i32 1 // CHECK4-51-NEXT: [[TMP36:%.*]] = ptrtoint ptr [[ARRAYIDX]] to i64 // CHECK4-51-NEXT: 
[[TMP37:%.*]] = ptrtoint ptr [[TMP35]] to i64 @@ -9364,13 +9364,13 @@ void test_omp_all_memory() // CHECK4-51-NEXT: [[TMP57:%.*]] = load i8, ptr [[B]], align 1 // CHECK4-51-NEXT: [[TMP58:%.*]] = sext i8 [[TMP57]] to i64 // CHECK4-51-NEXT: [[TMP59:%.*]] = mul nsw i64 4, [[TMP1]] -// CHECK4-51-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP59]] -// CHECK4-51-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX13]], i64 [[TMP58]] +// CHECK4-51-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP59]] +// CHECK4-51-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX13]], i64 [[TMP58]] // CHECK4-51-NEXT: [[TMP60:%.*]] = load i8, ptr [[B]], align 1 // CHECK4-51-NEXT: [[TMP61:%.*]] = sext i8 [[TMP60]] to i64 // CHECK4-51-NEXT: [[TMP62:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK4-51-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP62]] -// CHECK4-51-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX15]], i64 [[TMP61]] +// CHECK4-51-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP62]] +// CHECK4-51-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX15]], i64 [[TMP61]] // CHECK4-51-NEXT: [[TMP63:%.*]] = getelementptr i32, ptr [[ARRAYIDX16]], i32 1 // CHECK4-51-NEXT: [[TMP64:%.*]] = ptrtoint ptr [[ARRAYIDX14]] to i64 // CHECK4-51-NEXT: [[TMP65:%.*]] = ptrtoint ptr [[TMP63]] to i64 @@ -9404,13 +9404,13 @@ void test_omp_all_memory() // CHECK4-51-NEXT: [[TMP82:%.*]] = load i8, ptr [[B]], align 1 // CHECK4-51-NEXT: [[TMP83:%.*]] = sext i8 [[TMP82]] to i64 // CHECK4-51-NEXT: [[TMP84:%.*]] = mul nsw i64 4, [[TMP1]] -// CHECK4-51-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP84]] -// CHECK4-51-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX23]], i64 [[TMP83]] +// CHECK4-51-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP84]] +// CHECK4-51-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX23]], i64 [[TMP83]] // CHECK4-51-NEXT: [[TMP85:%.*]] = load i8, ptr [[B]], align 1 // CHECK4-51-NEXT: [[TMP86:%.*]] = sext i8 [[TMP85]] to i64 // CHECK4-51-NEXT: [[TMP87:%.*]] = mul nsw i64 9, [[TMP1]] -// CHECK4-51-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP87]] -// CHECK4-51-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX25]], i64 [[TMP86]] +// CHECK4-51-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP87]] +// CHECK4-51-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX25]], i64 [[TMP86]] // CHECK4-51-NEXT: [[TMP88:%.*]] = getelementptr i32, ptr [[ARRAYIDX26]], i32 1 // CHECK4-51-NEXT: [[TMP89:%.*]] = ptrtoint ptr [[ARRAYIDX24]] to i64 // CHECK4-51-NEXT: [[TMP90:%.*]] = ptrtoint ptr [[TMP88]] to i64 @@ -9449,8 +9449,8 @@ void test_omp_all_memory() // CHECK4-51-NEXT: [[TMP110:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_DEPEND_INFO]], ptr [[TMP107]], i32 0, i32 2 // CHECK4-51-NEXT: store i8 3, ptr [[TMP110]], align 8 // CHECK4-51-NEXT: [[TMP111:%.*]] = mul nsw i64 0, [[TMP1]] -// CHECK4-51-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP111]] -// CHECK4-51-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX33]], i64 3 +// CHECK4-51-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP111]] +// CHECK4-51-NEXT: [[ARRAYIDX34:%.*]] = 
getelementptr inbounds nuw i32, ptr [[ARRAYIDX33]], i64 3 // CHECK4-51-NEXT: [[TMP112:%.*]] = load i32, ptr @a, align 4 // CHECK4-51-NEXT: [[TMP113:%.*]] = sext i32 [[TMP112]] to i64 // CHECK4-51-NEXT: [[LEN_SUB_1:%.*]] = sub nsw i64 [[TMP113]], 1 @@ -9458,8 +9458,8 @@ void test_omp_all_memory() // CHECK4-51-NEXT: [[TMP115:%.*]] = sext i32 [[TMP114]] to i64 // CHECK4-51-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP115]] // CHECK4-51-NEXT: [[TMP116:%.*]] = mul nsw i64 [[LB_ADD_LEN]], [[TMP1]] -// CHECK4-51-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 [[TMP116]] -// CHECK4-51-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_1]] +// CHECK4-51-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i32, ptr [[VLA]], i64 [[TMP116]] +// CHECK4-51-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX35]], i64 [[LEN_SUB_1]] // CHECK4-51-NEXT: [[TMP117:%.*]] = getelementptr i32, ptr [[ARRAYIDX36]], i32 1 // CHECK4-51-NEXT: [[TMP118:%.*]] = ptrtoint ptr [[ARRAYIDX34]] to i64 // CHECK4-51-NEXT: [[TMP119:%.*]] = ptrtoint ptr [[TMP117]] to i64 diff --git a/clang/test/OpenMP/task_in_reduction_codegen.cpp b/clang/test/OpenMP/task_in_reduction_codegen.cpp index aa2a478137990..29dc12978d7d9 100644 --- a/clang/test/OpenMP/task_in_reduction_codegen.cpp +++ b/clang/test/OpenMP/task_in_reduction_codegen.cpp @@ -90,7 +90,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP2]], align 16 // CHECK1-NEXT: store i64 [[TMP2]], ptr [[__VLA_EXPR0]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[A]], ptr [[TMP4]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -105,7 +105,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP9]], align 8 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP10]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[B]], ptr [[TMP11]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 1 @@ -120,7 +120,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..2, ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP17]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: 
[[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC_ADDR]], ptr [[TMP18]], align 8 // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 1 @@ -138,7 +138,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP25:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 3, ptr [[DOTRD_INPUT_]]) // CHECK1-NEXT: store ptr [[TMP25]], ptr [[DOTTASK_RED_]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 // CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0:%.*]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[C]], ptr [[TMP26]], align 8 // CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 1 @@ -153,7 +153,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..6, ptr [[TMP31]], align 8 // CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP32]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP33]], align 8 // CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 1 diff --git a/clang/test/OpenMP/taskgroup_task_reduction_codegen.cpp b/clang/test/OpenMP/taskgroup_task_reduction_codegen.cpp index faf86479dfdae..a8577c7a13579 100644 --- a/clang/test/OpenMP/taskgroup_task_reduction_codegen.cpp +++ b/clang/test/OpenMP/taskgroup_task_reduction_codegen.cpp @@ -67,7 +67,7 @@ int main(int argc, char **argv) { // CHECK-DAG: [[A_REF]] = getelementptr inbounds nuw [[T1]], ptr [[GEPA:%[^,]+]], i32 0, i32 0 // CHECK-DAG: store ptr [[A]], ptr [[A_REF:[^,]+]], // CHECK-DAG: [[A_REF]] = getelementptr inbounds nuw [[T1]], ptr [[GEPA]], i32 0, i32 1 -// CHECK-DAG: [[GEPA]] = getelementptr inbounds [3 x [[T1]]], ptr [[RD_IN1]], i64 0, i64 +// CHECK-DAG: [[GEPA]] = getelementptr inbounds nuw [3 x [[T1]]], ptr [[RD_IN1]], i64 0, i64 // CHECK-DAG: [[TMP6:%.+]] = getelementptr inbounds nuw [[T1]], ptr [[GEPA]], i32 0, i32 2 // CHECK-DAG: store i64 4, ptr [[TMP6]], // CHECK-DAG: [[TMP7:%.+]] = getelementptr inbounds nuw [[T1]], ptr [[GEPA]], i32 0, i32 3 @@ -82,7 +82,7 @@ int main(int argc, char **argv) { // CHECK-DAG: [[TMP12]] = getelementptr inbounds nuw [[T1]], ptr 
[[GEPB:%[^,]+]], i32 0, i32 0 // CHECK-DAG: store ptr [[B]], ptr [[TMP12:%[^,]+]], // CHECK-DAG: [[TMP12]] = getelementptr inbounds nuw [[T1]], ptr [[GEPB]], i32 0, i32 1 -// CHECK-DAG: [[GEPB]] = getelementptr inbounds [3 x [[T1]]], ptr [[RD_IN1]], i64 0, i64 +// CHECK-DAG: [[GEPB]] = getelementptr inbounds nuw [3 x [[T1]]], ptr [[RD_IN1]], i64 0, i64 // CHECK-DAG: [[TMP14:%.+]] = getelementptr inbounds nuw [[T1]], ptr [[GEPB]], i32 0, i32 2 // CHECK-DAG: store i64 4, ptr [[TMP14]], // CHECK-DAG: [[TMP15:%.+]] = getelementptr inbounds nuw [[T1]], ptr [[GEPB]], i32 0, i32 3 @@ -97,7 +97,7 @@ int main(int argc, char **argv) { // CHECK-DAG: [[TMP20]] = getelementptr inbounds nuw [[T1]], ptr [[GEPARGC:%[^,]+]], i32 0, i32 0 // CHECK-DAG: store ptr [[ARGC_ADDR]], ptr [[TMP20:%[^,]+]], // CHECK-DAG: [[TMP20]] = getelementptr inbounds nuw [[T1]], ptr [[GEPARGC]], i32 0, i32 1 -// CHECK-DAG: [[GEPARGC]] = getelementptr inbounds [3 x [[T1]]], ptr [[RD_IN1]], i64 0, i64 +// CHECK-DAG: [[GEPARGC]] = getelementptr inbounds nuw [3 x [[T1]]], ptr [[RD_IN1]], i64 0, i64 // CHECK-DAG: [[TMP22:%.+]] = getelementptr inbounds nuw [[T1]], ptr [[GEPARGC]], i32 0, i32 2 // CHECK-DAG: store i64 4, ptr [[TMP22]], // CHECK-DAG: [[TMP23:%.+]] = getelementptr inbounds nuw [[T1]], ptr [[GEPARGC]], i32 0, i32 3 @@ -116,7 +116,7 @@ int main(int argc, char **argv) { // CHECK-DAG: [[TMP30]] = getelementptr inbounds nuw [[T2]], ptr [[GEPC:%[^,]+]], i32 0, i32 0 // CHECK-DAG: store ptr [[C]], ptr [[TMP30:%[^,]+]], // CHECK-DAG: [[TMP30]] = getelementptr inbounds nuw [[T2]], ptr [[GEPC]], i32 0, i32 1 -// CHECK-DAG: [[GEPC]] = getelementptr inbounds [2 x [[T2]]], ptr [[RD_IN2]], i64 0, i64 +// CHECK-DAG: [[GEPC]] = getelementptr inbounds nuw [2 x [[T2]]], ptr [[RD_IN2]], i64 0, i64 // CHECK-DAG: [[TMP32:%.+]] = getelementptr inbounds nuw [[T2]], ptr [[GEPC]], i32 0, i32 2 // CHECK-DAG: store i64 20, ptr [[TMP32]], // CHECK-DAG: [[TMP33:%.+]] = getelementptr inbounds nuw [[T2]], ptr [[GEPC]], i32 0, i32 3 @@ -131,7 +131,7 @@ int main(int argc, char **argv) { // CHECK-DAG: [[TMP38]] = getelementptr inbounds nuw [[T2]], ptr [[GEPVLA:%[^,]+]], i32 0, i32 0 // CHECK-DAG: store ptr [[VLA]], ptr [[TMP38:%[^,]+]], // CHECK-DAG: [[TMP38]] = getelementptr inbounds nuw [[T2]], ptr [[GEPVLA]], i32 0, i32 1 -// CHECK-DAG: [[GEPVLA]] = getelementptr inbounds [2 x [[T2]]], ptr [[RD_IN2]], i64 0, i64 +// CHECK-DAG: [[GEPVLA]] = getelementptr inbounds nuw [2 x [[T2]]], ptr [[RD_IN2]], i64 0, i64 // CHECK-DAG: [[TMP40:%.+]] = mul nuw i64 [[VLA_SIZE]], 2 // CHECK-DAG: [[TMP41:%.+]] = udiv exact i64 [[TMP40]], ptrtoint (ptr getelementptr (i16, ptr null, i32 1) to i64) // CHECK-DAG: [[TMP42:%.+]] = getelementptr inbounds nuw [[T2]], ptr [[GEPVLA]], i32 0, i32 2 diff --git a/clang/test/OpenMP/taskloop_in_reduction_codegen.cpp b/clang/test/OpenMP/taskloop_in_reduction_codegen.cpp index ae0d007756140..87b4cd2caf18a 100644 --- a/clang/test/OpenMP/taskloop_in_reduction_codegen.cpp +++ b/clang/test/OpenMP/taskloop_in_reduction_codegen.cpp @@ -76,7 +76,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP2]], align 16 // CHECK1-NEXT: store i64 [[TMP2]], ptr [[__VLA_EXPR0]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], 
i64 0, i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[A]], ptr [[TMP4]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -91,7 +91,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP9]], align 8 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP10]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[B]], ptr [[TMP11]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 1 @@ -106,7 +106,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..2, ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP17]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC_ADDR]], ptr [[TMP18]], align 8 // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 1 @@ -124,7 +124,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP25:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 3, ptr [[DOTRD_INPUT_]]) // CHECK1-NEXT: store ptr [[TMP25]], ptr [[DOTTASK_RED_]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 // CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0:%.*]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[C]], ptr [[TMP26]], align 8 // CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 1 @@ -139,7 +139,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..6, ptr [[TMP31]], align 8 // CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP32]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds [2 x 
%struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP33]], align 8 // CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 1 diff --git a/clang/test/OpenMP/taskloop_reduction_codegen.cpp b/clang/test/OpenMP/taskloop_reduction_codegen.cpp index 3cdc88ba20b77..6eca033eca551 100644 --- a/clang/test/OpenMP/taskloop_reduction_codegen.cpp +++ b/clang/test/OpenMP/taskloop_reduction_codegen.cpp @@ -83,9 +83,9 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB1:.+]], ptr [[TMP25]], // CHECK-DAG: [[TMP26:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 8 [[TMP26]], i8 0, i64 4, i1 false) -// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 0 +// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 0 // CHECK-DAG: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, % -// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] +// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], // CHECK-DAG: [[TMP28]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_4:%.+]], i32 0, i32 0 // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], @@ -137,10 +137,10 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB4:.+]], ptr [[TMP59]], // CHECK-DAG: [[TMP60:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_8]], i32 0, i32 6 // CHECK-DAG: store i32 1, ptr [[TMP60]], -// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 // CHECK: [[TMP62:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 4, ptr [[DOTRD_INPUT_]]) // CHECK: [[TMP63:%.*]] = load i32, ptr [[N]], // CHECK: store i32 [[TMP63]], ptr [[DOTCAPTURE_EXPR_]], diff --git a/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp b/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp index 6da28d2d973c9..9e4e51a442742 100644 --- 
a/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp +++ b/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp @@ -76,7 +76,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[VLA:%.*]] = alloca i16, i64 [[TMP2]], align 16 // CHECK1-NEXT: store i64 [[TMP2]], ptr [[__VLA_EXPR0]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[A]], ptr [[TMP4]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -91,7 +91,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP9]], align 8 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP10]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_1:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[B]], ptr [[TMP11]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 1 @@ -106,7 +106,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..2, ptr [[TMP16]], align 8 // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_1]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP17]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_2:%.*]] = getelementptr inbounds nuw [3 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 2 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC_ADDR]], ptr [[TMP18]], align 8 // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_2]], i32 0, i32 1 @@ -124,7 +124,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP25:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 3, ptr [[DOTRD_INPUT_]]) // CHECK1-NEXT: store ptr [[TMP25]], ptr [[DOTTASK_RED_]], align 8 // CHECK1-NEXT: call void @__kmpc_taskgroup(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_4:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 0 // CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0:%.*]], ptr 
[[DOTRD_INPUT_GEP_4]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[C]], ptr [[TMP26]], align 8 // CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 1 @@ -139,7 +139,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..6, ptr [[TMP31]], align 8 // CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_4]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP32]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_5:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_3]], i64 0, i64 1 // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP33]], align 8 // CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_5]], i32 0, i32 1 diff --git a/clang/test/OpenMP/taskloop_simd_reduction_codegen.cpp b/clang/test/OpenMP/taskloop_simd_reduction_codegen.cpp index d6e40831484aa..83ae053cfd9bd 100644 --- a/clang/test/OpenMP/taskloop_simd_reduction_codegen.cpp +++ b/clang/test/OpenMP/taskloop_simd_reduction_codegen.cpp @@ -80,9 +80,9 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB1:.+]], ptr [[TMP25]], // CHECK-DAG: [[TMP26:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 8 [[TMP26]], i8 0, i64 4, i1 false) -// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 0 +// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 0 // CHECK-DAG: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, % -// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] +// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw [100 x %struct.S], ptr [[C]], i64 0, i64 [[LB_ADD_LEN]] // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], // CHECK-DAG: [[TMP28]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_4:%.+]], i32 0, i32 0 // CHECK-DAG: store ptr [[ARRAYIDX5]], ptr [[TMP28:%[^,]+]], @@ -134,10 +134,10 @@ sum = 0.0; // CHECK-DAG: store ptr @[[RED_COMB4:.+]], ptr [[TMP59]], // CHECK-DAG: [[TMP60:%.*]] = getelementptr inbounds nuw %struct.kmp_taskred_input_t, ptr [[DOTRD_INPUT_GEP_8]], i32 0, i32 6 // CHECK-DAG: store i32 1, ptr [[TMP60]], -// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 -// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_4]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// 
CHECK-DAG: [[DOTRD_INPUT_GEP_7]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 +// CHECK-DAG: [[DOTRD_INPUT_GEP_8]] = getelementptr inbounds nuw [4 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 // CHECK: [[TMP62:%.*]] = call ptr @__kmpc_taskred_init(i32 [[TMP0]], i32 4, ptr [[DOTRD_INPUT_]]) // CHECK: [[TMP63:%.*]] = load i32, ptr [[N]], // CHECK: store i32 [[TMP63]], ptr [[DOTCAPTURE_EXPR_]], diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp index be499e0b36548..7987c2de7dd8f 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp @@ -100,16 +100,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGC_ADDR]], align 8 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP1]], i64 0 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP4]] // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP5]], i64 9 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] @@ -139,7 +139,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP20]] // CHECK1-NEXT: store ptr [[_TMP5]], ptr [[TMP]], align 8 // CHECK1-NEXT: store ptr [[TMP21]], ptr [[_TMP5]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP22]], align 8 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -154,19 +154,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb., ptr [[TMP27]], align 8 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 4, i1 
false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_6:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 0 // CHECK1-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds ptr, ptr [[TMP30]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP30]], i64 0 // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[ARRAYIDX7]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[TMP31]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP31]], i64 0 // CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN9:%.*]] = add nsw i64 -1, [[TMP33]] // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr [[TMP34]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP34]], i64 9 // CHECK1-NEXT: [[TMP35:%.*]] = load ptr, ptr [[ARRAYIDX10]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP35]], i64 [[LB_ADD_LEN9]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T]], ptr [[DOTRD_INPUT_GEP_6]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[TMP36]], align 8 @@ -444,16 +444,16 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 // CHECK1-NEXT: store i32 0, ptr [[ARGC1]], align 4 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP3]], i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP4]], i64 0 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, [[TMP6]] // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr [[TMP7]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP7]], i64 9 // CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 -// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[LB_ADD_LEN]] +// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i64 [[LB_ADD_LEN]] // CHECK1-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARRAYIDX4]] to i64 // CHECK1-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARRAYIDX2]] to i64 // CHECK1-NEXT: [[TMP11:%.*]] = sub i64 [[TMP9]], [[TMP10]] @@ -483,7 +483,7 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[VLA]], i64 [[TMP22]] // CHECK1-NEXT: store ptr 
[[_TMP6]], ptr [[_TMP5]], align 8 // CHECK1-NEXT: store ptr [[TMP23]], ptr [[_TMP6]], align 8 -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_]], i64 0, i64 0 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_]], i64 0, i64 0 // CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0:%.*]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[ARGC1]], ptr [[TMP24]], align 8 // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 1 @@ -498,19 +498,19 @@ int main(int argc, char **argv) { // CHECK1-NEXT: store ptr @.red_comb..4, ptr [[TMP29]], align 8 // CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_]], i32 0, i32 6 // CHECK1-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP30]], i8 0, i64 4, i1 false) -// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_]], i64 0, i64 1 +// CHECK1-NEXT: [[DOTRD_INPUT_GEP_7:%.*]] = getelementptr inbounds nuw [2 x %struct.kmp_taskred_input_t.0], ptr [[DOTRD_INPUT_]], i64 0, i64 1 // CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 0 // CHECK1-NEXT: [[TMP32:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds ptr, ptr [[TMP32]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP32]], i64 0 // CHECK1-NEXT: [[TMP33:%.*]] = load ptr, ptr [[ARRAYIDX8]], align 8 -// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[TMP33]], i64 0 +// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP33]], i64 0 // CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: [[TMP35:%.*]] = sext i32 [[TMP34]] to i64 // CHECK1-NEXT: [[LB_ADD_LEN10:%.*]] = add nsw i64 -1, [[TMP35]] // CHECK1-NEXT: [[TMP36:%.*]] = load ptr, ptr [[ARGV_ADDR]], align 8 -// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr [[TMP36]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP36]], i64 9 // CHECK1-NEXT: [[TMP37:%.*]] = load ptr, ptr [[ARRAYIDX11]], align 8 -// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, ptr [[TMP37]], i64 [[LB_ADD_LEN10]] +// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP37]], i64 [[LB_ADD_LEN10]] // CHECK1-NEXT: store ptr [[VLA]], ptr [[TMP31]], align 8 // CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASKRED_INPUT_T_0]], ptr [[DOTRD_INPUT_GEP_7]], i32 0, i32 1 // CHECK1-NEXT: store ptr [[ARRAYIDX9]], ptr [[TMP38]], align 8 @@ -831,9 +831,9 @@ int main(int argc, char **argv) { // CHECK1-NEXT: [[LB_ADD_LEN_I:%.*]] = add nsw i64 -1, [[TMP24]] // CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2 // CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP25]], align 8 -// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds ptr, ptr [[TMP26]], i64 9 +// CHECK1-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds nuw ptr, ptr [[TMP26]], i64 9 // CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX2_I]], align 8 -// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i64 
[[LB_ADD_LEN_I]] +// CHECK1-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP27]], i64 [[LB_ADD_LEN_I]] // CHECK1-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[ARRAYIDX3_I]] to i64 // CHECK1-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[TMP20]] to i64 // CHECK1-NEXT: [[TMP30:%.*]] = sub i64 [[TMP28]], [[TMP29]] diff --git a/clang/test/SemaCXX/cxx2a-consteval.cpp b/clang/test/SemaCXX/cxx2a-consteval.cpp index 6b0609a26c588..81923617f637e 100644 --- a/clang/test/SemaCXX/cxx2a-consteval.cpp +++ b/clang/test/SemaCXX/cxx2a-consteval.cpp @@ -380,11 +380,9 @@ void test() { { A k = to_lvalue_ref(A()); } // expected-error {{is not a constant expression}} // expected-note@-1 {{is not a constant expression}} expected-note@-1 {{temporary created here}} { A k = to_lvalue_ref(A().ret_a()); } - // expected-error@-1 {{'alloc::A::ret_a' is not a constant expression}} - // expected-note@-2 {{heap-allocated object is not a constant expression}} - // expected-error@-3 {{'alloc::to_lvalue_ref' is not a constant expression}} - // expected-note@-4 {{reference to temporary is not a constant expression}} - // expected-note@-5 {{temporary created here}} + // expected-note@-1 {{reference to temporary is not a constant expression}} + // expected-error@-2 {{'alloc::to_lvalue_ref' is not a constant expression}} + // expected-note@-3 {{temporary created here}} { int k = A().ret_a().ret_i(); } // expected-error@-1 {{'alloc::A::ret_a' is not a constant expression}} // expected-note@-2 {{heap-allocated object is not a constant expression}} @@ -394,19 +392,13 @@ void test() { { int k = rvalue_ref(A()); } { int k = rvalue_ref(std::move(a)); } { int k = const_a_ref(A().ret_a()); } - // expected-error@-1 {{'alloc::A::ret_a' is not a constant expression}} - // expected-note@-2 {{is not a constant expression}} { int k = const_a_ref(to_lvalue_ref(A().ret_a())); } - // expected-error@-1 {{'alloc::A::ret_a' is not a constant expression}} - // expected-note@-2 {{is not a constant expression}} { int k = const_a_ref(to_lvalue_ref(std::move(a))); } { int k = by_value_a(A().ret_a()); } { int k = by_value_a(to_lvalue_ref(static_cast(a))); } { int k = (A().ret_a(), A().ret_i()); }// expected-error {{is not a constant expression}} // expected-note@-1 {{is not a constant expression}} { int k = (const_a_ref(A().ret_a()), A().ret_i()); } - // expected-error@-1 {{'alloc::A::ret_a' is not a constant expression}} - // expected-note@-2 {{is not a constant expression}} } } @@ -1232,4 +1224,27 @@ consteval void immediate() { } +} + +namespace GH105558 { + +consteval int* alloc() { return new int(0); } +consteval void f(int* p) { delete p; } +consteval void g1(int*&& p) { delete p; } +consteval void g2(const int* p) { delete p; } +consteval void g3(int*const& p) { delete p; } +struct X { + int* p; + explicit(false) constexpr X(int* p) : p(p) {} +}; +consteval void g4(X x) { delete x.p; } + +void test() { + f(alloc()); + g1(alloc()); + g2(alloc()); + g3(alloc()); + g4(alloc()); +} + } diff --git a/clang/test/SemaCXX/cxx2c-pack-indexing.cpp b/clang/test/SemaCXX/cxx2c-pack-indexing.cpp index 9ea90a4c3e30f..7d7e808746217 100644 --- a/clang/test/SemaCXX/cxx2c-pack-indexing.cpp +++ b/clang/test/SemaCXX/cxx2c-pack-indexing.cpp @@ -231,3 +231,31 @@ struct type_info { namespace GH93650 { auto func(auto... 
inputArgs) { return typeid(inputArgs...[0]); } } // namespace GH93650 + + +namespace GH105900 { + +template +struct types { + template + static constexpr __SIZE_TYPE__ get_index() { return idx; } + + template + static auto x() -> opts...[get_index()] {} +}; + +template +struct vars { + template + static constexpr __SIZE_TYPE__ get_index() { return idx; } + + template + static auto x() -> decltype(opts...[get_index()]) {return 0;} +}; + +void f() { + types::x<0>(); + vars<0>::x<0>(); +} + +} diff --git a/clang/test/SemaCXX/no_destroy.cpp b/clang/test/SemaCXX/no_destroy.cpp index 5872bcf4b439e..d39bcaeff860a 100644 --- a/clang/test/SemaCXX/no_destroy.cpp +++ b/clang/test/SemaCXX/no_destroy.cpp @@ -1,31 +1,21 @@ -// RUN: %clang_cc1 -DNO_DTORS -DNO_EXCEPTIONS -fno-c++-static-destructors -verify %s -// RUN: %clang_cc1 -DNO_EXCEPTIONS -verify %s -// RUN: %clang_cc1 -DNO_DTORS -fexceptions -fno-c++-static-destructors -verify %s -// RUN: %clang_cc1 -fexceptions -verify %s +// RUN: %clang_cc1 -fc++-static-destructors=none -verify %s +// RUN: %clang_cc1 -fc++-static-destructors=thread-local -verify=expected,thread-local-dtors %s +// RUN: %clang_cc1 -verify=expected,thread-local-dtors,all-dtors %s +// RUN: %clang_cc1 -fexceptions -fc++-static-destructors=none -verify %s +// RUN: %clang_cc1 -fexceptions -fc++-static-destructors=thread-local -verify=expected,thread-local-dtors %s +// RUN: %clang_cc1 -fexceptions -verify=expected,thread-local-dtors,all-dtors %s struct SecretDestructor { -#ifndef NO_DTORS - // expected-note@+2 4 {{private}} -#endif private: ~SecretDestructor(); // expected-note + {{private}} }; -SecretDestructor sd1; -thread_local SecretDestructor sd2; +SecretDestructor sd1; // all-dtors-error{{private}} +thread_local SecretDestructor sd2; // thread-local-dtors-error{{private}} void locals() { - static SecretDestructor sd3; - thread_local SecretDestructor sd4; + static SecretDestructor sd3; // all-dtors-error{{private}} + thread_local SecretDestructor sd4; // thread-local-dtors-error{{private}} } -#ifndef NO_DTORS -// SecretDestructor sd1; // expected-error@-8 {{private}} -// thread_local SecretDestructor sd2; // expected-error@-8 {{private}} -// void locals() { -// static SecretDestructor sd3; // expected-error@-8 {{private}} -// thread_local SecretDestructor sd4; // expected-error@-8 {{private}} -// } -#endif - [[clang::always_destroy]] SecretDestructor sd6; // expected-error{{private}} [[clang::always_destroy]] thread_local SecretDestructor sd7; // expected-error{{private}} diff --git a/clang/test/SemaHLSL/Types/Traits/ScalarizedLayoutCompatible.hlsl b/clang/test/SemaHLSL/Types/Traits/ScalarizedLayoutCompatible.hlsl new file mode 100644 index 0000000000000..db46a8e141495 --- /dev/null +++ b/clang/test/SemaHLSL/Types/Traits/ScalarizedLayoutCompatible.hlsl @@ -0,0 +1,132 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -verify %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -fnative-half-type -verify %s +// expected-no-diagnostics + +// Case 1: How many ways can I come up with to represent three float values? 
+struct ThreeFloats1 { + float X, Y, Z; +}; + +struct ThreeFloats2 { + float X[3]; +}; + +struct ThreeFloats3 { + float3 V; +}; + +struct ThreeFloats4 { + float2 V; + float F; +}; + +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(float3, float[3]), ""); +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(float3, ThreeFloats1), ""); +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(float3, ThreeFloats2), ""); +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(float3, ThreeFloats3), ""); +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(float3, ThreeFloats4), ""); + +// Case 2: structs and base classes and arrays, oh my! +struct Dog { + int Leg[4]; + bool Tail; + float Fur; +}; + +struct Shiba { + int4 StubbyLegs; + bool CurlyTail; + struct Coating { + float Fur; + } F; +}; + +struct FourLegged { + int FR, FL, BR, BL; +}; + +struct Doggo : FourLegged { + bool WaggyBit; + float Fuzz; +}; + +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(Dog, Shiba), ""); +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(Dog, Doggo), ""); + +// Case 3: Arrays of structs inside structs + +struct Cat { + struct Leg { + int L; + } Legs[4]; + struct Other { + bool Tail; + float Furs; + } Bits; +}; + +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(Dog, Cat), ""); + +// case 4: Arrays of structs inside arrays of structs. +struct Pets { + Dog Puppers[6]; + Cat Kitties[4]; +}; + +struct Animals { + Dog Puppers[2]; + Cat Kitties[8]; +}; + +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(Pets, Animals), ""); + +// Case 5: Turtles all the way down... + +typedef int Turtle; + +enum Ninja : Turtle { + Leonardo, + Donatello, + Michelangelo, + Raphael, +}; + +enum NotNinja : Turtle { + Fred, + Mikey, +}; + +enum Mammals : uint { + Dog, + Cat, +}; + +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(Ninja, NotNinja), ""); +_Static_assert(!__builtin_hlsl_is_scalarized_layout_compatible(Ninja, Mammals), ""); + +// Case 6: Some basic types. +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(int, int32_t), ""); +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(uint, uint32_t), ""); +_Static_assert(!__builtin_hlsl_is_scalarized_layout_compatible(int, uint), ""); +_Static_assert(!__builtin_hlsl_is_scalarized_layout_compatible(int, float), ""); + +// Even though half and float may be the same size we don't want them to be +// layout compatible since they are different types. +_Static_assert(!__builtin_hlsl_is_scalarized_layout_compatible(half, float), ""); + +// Case 6: Empty classes... because they're fun. 
+ +struct NotEmpty { int X; }; +struct Empty {}; +struct AlsoEmpty {}; + +struct DerivedEmpty : Empty {}; + +struct DerivedNotEmpty : Empty { int X; }; +struct DerivedEmptyNotEmptyBase : NotEmpty {}; + +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(Empty, AlsoEmpty), ""); +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(Empty, DerivedEmpty), ""); + +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(NotEmpty, DerivedNotEmpty), ""); +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(NotEmpty, DerivedEmptyNotEmptyBase), ""); diff --git a/clang/test/SemaHLSL/Types/Traits/ScalarizedLayoutCompatibleErrors.hlsl b/clang/test/SemaHLSL/Types/Traits/ScalarizedLayoutCompatibleErrors.hlsl new file mode 100644 index 0000000000000..4c96795da7fd0 --- /dev/null +++ b/clang/test/SemaHLSL/Types/Traits/ScalarizedLayoutCompatibleErrors.hlsl @@ -0,0 +1,64 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -verify %s + +// Some things that don't work! + +// Case 1: Both types must be complete! +struct Defined { + int X; +}; + + +struct Undefined; // expected-note {{forward declaration of 'Undefined'}} + +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(Undefined, Defined), ""); // expected-error{{incomplete type 'Undefined' where a complete type is required}} + +// Case 2: No variable length arrays! + +void fn(int X) { + // expected-error@#vla {{variable length arrays are not supported for the current target}} + // expected-error@#vla {{variable length arrays are not supported in '__builtin_hlsl_is_scalarized_layout_compatible'}} + // expected-error@#vla {{static assertion failed due to requirement '__builtin_hlsl_is_scalarized_layout_compatible(int[4], int[X])'}} + // expected-warning@#vla {{variable length arrays in C++ are a Clang extension}} + _Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(int[4], int[X]), ""); // #vla +} + +// Case 3: Make this always fail for unions. +// HLSL doesn't really support unions, and the places where scalarized layouts +// are valid is probably going to be really confusing for unions, so we should +// just make sure unions are never scalarized compatible with anything other +// than themselves. 
+ +union Wah { + int OhNo; + float NotAgain; +}; + +struct OneInt { + int I; +}; + +struct OneFloat { + float F; +}; + +struct HasUnion { + int I; + Wah W; +}; + +struct HasUnionSame { + int I; + Wah W; +}; + +struct HasUnionDifferent { + Wah W; + int I; +}; + +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(Wah, Wah), "Identical types are always compatible"); +_Static_assert(!__builtin_hlsl_is_scalarized_layout_compatible(Wah, OneInt), "Unions are not compatible with anything else"); +_Static_assert(!__builtin_hlsl_is_scalarized_layout_compatible(Wah, OneFloat), "Unions are not compatible with anything else"); + +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(HasUnion, HasUnionSame), ""); +_Static_assert(!__builtin_hlsl_is_scalarized_layout_compatible(HasUnion, HasUnionDifferent), ""); diff --git a/clang/test/SemaTemplate/alias-template-with-lambdas.cpp b/clang/test/SemaTemplate/alias-template-with-lambdas.cpp index ff94031e4d86f..5ec93163e4d18 100644 --- a/clang/test/SemaTemplate/alias-template-with-lambdas.cpp +++ b/clang/test/SemaTemplate/alias-template-with-lambdas.cpp @@ -91,15 +91,84 @@ void bar() { namespace GH82104 { -template int Zero = 0; +template constexpr int Value = sizeof...(D); -template -using T14 = decltype([]() { return Zero; }()); +template +using T14 = decltype([](auto Param) { + return Value + V + (int)sizeof(Param); +}("hello")); template using T15 = T14; static_assert(__is_same(T15, int)); +// FIXME: This still crashes because we can't extract template arguments T and U +// outside of the instantiation context of T16. +#if 0 +template +using T16 = decltype([](auto Param) requires (sizeof(Param) != 1 && sizeof...(U) > 0) { + return Value + sizeof(Param); +}); +static_assert(T16()(42) == 2 + sizeof(42)); +#endif } // namespace GH82104 +namespace GH89853 { + +template +static constexpr auto innocuous = [] { return m; }; + +template > +using broken = decltype(Pred.template operator()<42>()); + +broken<> *boom; + +template { + (void)static_cast(c); + }> +using broken2 = decltype(Pred.template operator()<42>()); + +broken2<> *boom2; + +template { return m; }> +using broken3 = decltype(Pred.template operator()<42>()); + +broken3<> *boom3; + +static constexpr auto non_default = [](True auto) { + (void) static_cast(c); +}; + +template +using broken4 = decltype(Pred.template operator()<42>(Pred)); + +broken4* boom4; + +} // namespace GH89853 + +namespace GH105885 { + +template +using test = decltype([](auto...) { +}()); + +static_assert(__is_same(test<0>, void)); + +} // namespace GH105885 + +namespace GH102760 { + +auto make_tuple = []< class Tag, class... Captures>(Tag, Captures...) { + return []< class _Fun >( _Fun) -> void requires requires { 0; } + {}; +}; + +template < class, class... 
_As > +using Result = decltype(make_tuple(0)(_As{}...)); + +using T = Result; + +} // namespace GH102760 + } // namespace lambda_calls diff --git a/clang/tools/clang-format/clang-format-diff.py b/clang/tools/clang-format/clang-format-diff.py index 9eec0f3c89de3..aebe193eadb34 100755 --- a/clang/tools/clang-format/clang-format-diff.py +++ b/clang/tools/clang-format/clang-format-diff.py @@ -168,7 +168,7 @@ def main(): 'Failed to run "%s" - %s"' % (" ".join(command), e.strerror) ) - stdout, stderr = p.communicate() + stdout, _stderr = p.communicate() if p.returncode != 0: return p.returncode diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors.cpp index e451ea6b03622..1b1655f94eaa5 100644 --- a/compiler-rt/lib/rtsan/rtsan_interceptors.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors.cpp @@ -161,6 +161,37 @@ INTERCEPTOR(int, puts, const char *s) { return REAL(puts)(s); } +INTERCEPTOR(ssize_t, read, int fd, void *buf, size_t count) { + ExpectNotRealtime("read"); + return REAL(read)(fd, buf, count); +} + +INTERCEPTOR(ssize_t, write, int fd, const void *buf, size_t count) { + ExpectNotRealtime("write"); + return REAL(write)(fd, buf, count); +} + +INTERCEPTOR(ssize_t, pread, int fd, void *buf, size_t count, off_t offset) { + ExpectNotRealtime("pread"); + return REAL(pread)(fd, buf, count, offset); +} + +INTERCEPTOR(ssize_t, readv, int fd, const struct iovec *iov, int iovcnt) { + ExpectNotRealtime("readv"); + return REAL(readv)(fd, iov, iovcnt); +} + +INTERCEPTOR(ssize_t, pwrite, int fd, const void *buf, size_t count, + off_t offset) { + ExpectNotRealtime("pwrite"); + return REAL(pwrite)(fd, buf, count, offset); +} + +INTERCEPTOR(ssize_t, writev, int fd, const struct iovec *iov, int iovcnt) { + ExpectNotRealtime("writev"); + return REAL(writev)(fd, iov, iovcnt); +} + // Concurrency #if SANITIZER_APPLE #pragma clang diagnostic push @@ -400,6 +431,12 @@ void __rtsan::InitializeInterceptors() { INTERCEPT_FUNCTION(close); INTERCEPT_FUNCTION(fopen); INTERCEPT_FUNCTION(fread); + INTERCEPT_FUNCTION(read); + INTERCEPT_FUNCTION(write); + INTERCEPT_FUNCTION(pread); + INTERCEPT_FUNCTION(readv); + INTERCEPT_FUNCTION(pwrite); + INTERCEPT_FUNCTION(writev); INTERCEPT_FUNCTION(fwrite); INTERCEPT_FUNCTION(fclose); INTERCEPT_FUNCTION(fcntl); diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors.cpp index 5b88cf6461294..f91d694dd1e05 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors.cpp @@ -18,6 +18,8 @@ #if SANITIZER_APPLE #include #include +#include +#include #endif #if SANITIZER_INTERCEPT_MEMALIGN || SANITIZER_INTERCEPT_PVALLOC @@ -33,6 +35,7 @@ #include #include #include +#include using namespace testing; using namespace rtsan_testing; @@ -71,6 +74,12 @@ TEST(TestRtsanInterceptors, MallocDiesWhenRealtime) { ExpectNonRealtimeSurvival(Func); } +TEST(TestRtsanInterceptors, CallocDiesWhenRealtime) { + auto Func = []() { EXPECT_NE(nullptr, calloc(2, 4)); }; + ExpectRealtimeDeath(Func, "calloc"); + ExpectNonRealtimeSurvival(Func); +} + TEST(TestRtsanInterceptors, ReallocDiesWhenRealtime) { void *ptr_1 = malloc(1); auto Func = [ptr_1]() { EXPECT_NE(nullptr, realloc(ptr_1, 8)); }; @@ -274,23 +283,43 @@ TEST_F(RtsanFileTest, FopenDiesWhenRealtime) { ExpectNonRealtimeSurvival(func); } -TEST_F(RtsanFileTest, FreadDiesWhenRealtime) { - auto fd = fopen(GetTemporaryFilePath(), "w"); - auto func = [fd]() { +class RtsanOpenedFileTest : 
public RtsanFileTest { +protected: + void SetUp() override { + RtsanFileTest::SetUp(); + file = fopen(GetTemporaryFilePath(), "w"); + ASSERT_THAT(file, Ne(nullptr)); + fd = fileno(file); + ASSERT_THAT(fd, Ne(-1)); + } + + void TearDown() override { + if (file != nullptr) + fclose(file); + RtsanFileTest::TearDown(); + } + + FILE *GetOpenFile() { return file; } + + int GetOpenFd() { return fd; } + +private: + FILE *file = nullptr; + int fd = -1; +}; + +TEST_F(RtsanOpenedFileTest, FreadDiesWhenRealtime) { + auto func = [this]() { char c{}; - fread(&c, 1, 1, fd); + fread(&c, 1, 1, GetOpenFile()); }; ExpectRealtimeDeath(func, "fread"); ExpectNonRealtimeSurvival(func); - if (fd != nullptr) - fclose(fd); } -TEST_F(RtsanFileTest, FwriteDiesWhenRealtime) { - auto fd = fopen(GetTemporaryFilePath(), "w"); - ASSERT_NE(nullptr, fd); - auto message = "Hello, world!"; - auto func = [&]() { fwrite(&message, 1, 4, fd); }; +TEST_F(RtsanOpenedFileTest, FwriteDiesWhenRealtime) { + const char *message = "Hello, world!"; + auto func = [&]() { fwrite(&message, 1, 4, GetOpenFile()); }; ExpectRealtimeDeath(func, "fwrite"); ExpectNonRealtimeSurvival(func); } @@ -309,14 +338,66 @@ TEST(TestRtsanInterceptors, PutsDiesWhenRealtime) { ExpectNonRealtimeSurvival(func); } -TEST_F(RtsanFileTest, FputsDiesWhenRealtime) { - auto fd = fopen(GetTemporaryFilePath(), "w"); - ASSERT_THAT(fd, Ne(nullptr)) << errno; - auto func = [fd]() { fputs("Hello, world!\n", fd); }; +TEST_F(RtsanOpenedFileTest, FputsDiesWhenRealtime) { + auto func = [this]() { fputs("Hello, world!\n", GetOpenFile()); }; ExpectRealtimeDeath(func); ExpectNonRealtimeSurvival(func); - if (fd != nullptr) - fclose(fd); +} + +TEST_F(RtsanOpenedFileTest, ReadDiesWhenRealtime) { + auto Func = [this]() { + char c{}; + read(GetOpenFd(), &c, 1); + }; + ExpectRealtimeDeath(Func, "read"); + ExpectNonRealtimeSurvival(Func); +} + +TEST_F(RtsanOpenedFileTest, WriteDiesWhenRealtime) { + auto Func = [this]() { + char c = 'a'; + write(GetOpenFd(), &c, 1); + }; + ExpectRealtimeDeath(Func, "write"); + ExpectNonRealtimeSurvival(Func); +} + +TEST_F(RtsanOpenedFileTest, PreadDiesWhenRealtime) { + auto Func = [this]() { + char c{}; + pread(GetOpenFd(), &c, 1, 0); + }; + ExpectRealtimeDeath(Func, "pread"); + ExpectNonRealtimeSurvival(Func); +} + +TEST_F(RtsanOpenedFileTest, ReadvDiesWhenRealtime) { + auto Func = [this]() { + char c{}; + iovec iov{&c, 1}; + readv(GetOpenFd(), &iov, 1); + }; + ExpectRealtimeDeath(Func, "readv"); + ExpectNonRealtimeSurvival(Func); +} + +TEST_F(RtsanOpenedFileTest, PwriteDiesWhenRealtime) { + auto Func = [this]() { + char c = 'a'; + pwrite(GetOpenFd(), &c, 1, 0); + }; + ExpectRealtimeDeath(Func, "pwrite"); + ExpectNonRealtimeSurvival(Func); +} + +TEST_F(RtsanOpenedFileTest, WritevDiesWhenRealtime) { + auto Func = [this]() { + char c = 'a'; + iovec iov{&c, 1}; + writev(GetOpenFd(), &iov, 1); + }; + ExpectRealtimeDeath(Func, "writev"); + ExpectNonRealtimeSurvival(Func); } /* diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h index 27f8697db7838..fba21c3cb6a09 100644 --- a/compiler-rt/lib/scudo/standalone/secondary.h +++ b/compiler-rt/lib/scudo/standalone/secondary.h @@ -72,6 +72,15 @@ namespace { struct CachedBlock { static constexpr u16 CacheIndexMax = UINT16_MAX; static constexpr u16 InvalidEntry = CacheIndexMax; + // * MaxReleasedCachePages default is currently 4 + // - We arrived at this value after noticing that mapping + // in larger memory regions performs better than releasing + // memory and 
forcing a cache hit. According to the data, + // it suggests that beyond 4 pages, the release execution time is + // longer than the map execution time. In this way, the default + // is dependent on the platform. + // TODO: set MaxReleasedCachePages back to 4U + static constexpr uptr MaxReleasedCachePages = 0U; uptr CommitBase = 0; uptr CommitSize = 0; @@ -90,8 +99,9 @@ struct CachedBlock { template class MapAllocatorNoCache { public: void init(UNUSED s32 ReleaseToOsInterval) {} - CachedBlock retrieve(UNUSED uptr Size, UNUSED uptr Alignment, - UNUSED uptr HeadersSize, UNUSED uptr &EntryHeaderPos) { + CachedBlock retrieve(UNUSED uptr MaxAllowedFragmentedBytes, UNUSED uptr Size, + UNUSED uptr Alignment, UNUSED uptr HeadersSize, + UNUSED uptr &EntryHeaderPos) { return {}; } void store(UNUSED Options Options, UNUSED uptr CommitBase, @@ -121,7 +131,7 @@ template class MapAllocatorNoCache { } }; -static const uptr MaxUnusedCachePages = 4U; +static const uptr MaxUnreleasedCachePages = 4U; template bool mapSecondary(const Options &Options, uptr CommitBase, uptr CommitSize, @@ -151,9 +161,11 @@ bool mapSecondary(const Options &Options, uptr CommitBase, uptr CommitSize, } } - const uptr MaxUnusedCacheBytes = MaxUnusedCachePages * PageSize; - if (useMemoryTagging(Options) && CommitSize > MaxUnusedCacheBytes) { - const uptr UntaggedPos = Max(AllocPos, CommitBase + MaxUnusedCacheBytes); + const uptr MaxUnreleasedCacheBytes = MaxUnreleasedCachePages * PageSize; + if (useMemoryTagging(Options) && + CommitSize > MaxUnreleasedCacheBytes) { + const uptr UntaggedPos = + Max(AllocPos, CommitBase + MaxUnreleasedCacheBytes); return MemMap.remap(CommitBase, UntaggedPos - CommitBase, "scudo:secondary", MAP_MEMTAG | Flags) && MemMap.remap(UntaggedPos, CommitBase + CommitSize - UntaggedPos, @@ -334,13 +346,13 @@ class MapAllocatorCache { } } - CachedBlock retrieve(uptr Size, uptr Alignment, uptr HeadersSize, - uptr &EntryHeaderPos) EXCLUDES(Mutex) { + CachedBlock retrieve(uptr MaxAllowedFragmentedPages, uptr Size, + uptr Alignment, uptr HeadersSize, uptr &EntryHeaderPos) + EXCLUDES(Mutex) { const uptr PageSize = getPageSizeCached(); // 10% of the requested size proved to be the optimal choice for // retrieving cached blocks after testing several options. constexpr u32 FragmentedBytesDivisor = 10; - bool Found = false; CachedBlock Entry; EntryHeaderPos = 0; { @@ -348,47 +360,100 @@ class MapAllocatorCache { CallsToRetrieve++; if (EntriesCount == 0) return {}; - u32 OptimalFitIndex = 0; + u16 RetrievedIndex = CachedBlock::InvalidEntry; uptr MinDiff = UINTPTR_MAX; - for (u32 I = LRUHead; I != CachedBlock::InvalidEntry; + + // Since allocation sizes don't always match cached memory chunk sizes + // we allow some memory to be unused (called fragmented bytes). The + // amount of unused bytes is exactly EntryHeaderPos - CommitBase. + // + // CommitBase CommitBase + CommitSize + // V V + // +---+------------+-----------------+---+ + // | | | | | + // +---+------------+-----------------+---+ + // ^ ^ ^ + // Guard EntryHeaderPos Guard-page-end + // page-begin + // + // [EntryHeaderPos, CommitBase + CommitSize) contains the user data as + // well as the header metadata. If EntryHeaderPos - CommitBase exceeds + // MaxAllowedFragmentedPages * PageSize, the cached memory chunk is + // not considered valid for retrieval. 
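The validity rule spelled out in the comment above reduces to a small amount of arithmetic. The following is a minimal sketch, not part of the patch and not the Scudo implementation: `roundDownTo` and the parameter list are assumptions standing in for Scudo's `roundDown` and the values computed in the loop that follows, and it checks the header-based bound the comment describes (the in-tree check currently bounds `AllocPos`, per the TODO inside the loop).

// Minimal sketch of the retrieval fragmentation bound described above.
// Assumes a power-of-two Alignment; names are illustrative stand-ins.
#include <cstdint>

using uptr = uintptr_t;

static uptr roundDownTo(uptr X, uptr Boundary) { return X & ~(Boundary - 1); }

// Returns true if a cached chunk [CommitBase, CommitBase + CommitSize) can
// serve a request of Size/Alignment/HeadersSize without exceeding the
// MaxAllowedFragmentedPages budget of unused leading bytes.
bool fitsFragmentationBudget(uptr CommitBase, uptr CommitSize, uptr Size,
                             uptr Alignment, uptr HeadersSize,
                             uptr MaxAllowedFragmentedPages, uptr PageSize) {
  const uptr AllocPos = roundDownTo(CommitBase + CommitSize - Size, Alignment);
  const uptr HeaderPos = AllocPos - HeadersSize;
  if (HeaderPos < CommitBase || HeaderPos > CommitBase + CommitSize)
    return false; // request does not fit inside this cached chunk at all
  // Leading bytes between CommitBase and the header are the fragmented bytes.
  return HeaderPos - CommitBase <= MaxAllowedFragmentedPages * PageSize;
}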
+      for (u16 I = LRUHead; I != CachedBlock::InvalidEntry;
            I = Entries[I].Next) {
         const uptr CommitBase = Entries[I].CommitBase;
         const uptr CommitSize = Entries[I].CommitSize;
         const uptr AllocPos =
             roundDown(CommitBase + CommitSize - Size, Alignment);
         const uptr HeaderPos = AllocPos - HeadersSize;
+        const uptr MaxAllowedFragmentedBytes =
+            MaxAllowedFragmentedPages * PageSize;
         if (HeaderPos > CommitBase + CommitSize)
           continue;
+        // TODO: Remove AllocPos > CommitBase + MaxAllowedFragmentedBytes
+        // and replace with Diff > MaxAllowedFragmentedBytes
         if (HeaderPos < CommitBase ||
-            AllocPos > CommitBase + PageSize * MaxUnusedCachePages) {
+            AllocPos > CommitBase + MaxAllowedFragmentedBytes) {
           continue;
         }
-        Found = true;
-        const uptr Diff = HeaderPos - CommitBase;
-        // immediately use a cached block if it's size is close enough to the
-        // requested size.
-        const uptr MaxAllowedFragmentedBytes =
-            (CommitBase + CommitSize - HeaderPos) / FragmentedBytesDivisor;
-        if (Diff <= MaxAllowedFragmentedBytes) {
-          OptimalFitIndex = I;
-          EntryHeaderPos = HeaderPos;
-          break;
-        }
-        // keep track of the smallest cached block
+
+        const uptr Diff = roundDown(HeaderPos, PageSize) - CommitBase;
+
+        // Keep track of the smallest cached block
         // that is greater than (AllocSize + HeaderSize)
-        if (Diff > MinDiff)
+        if (Diff >= MinDiff)
           continue;
-        OptimalFitIndex = I;
+        MinDiff = Diff;
+        RetrievedIndex = I;
         EntryHeaderPos = HeaderPos;
+
+        // Immediately use a cached block if its size is close enough to the
+        // requested size
+        const uptr OptimalFitThresholdBytes =
+            (CommitBase + CommitSize - HeaderPos) / FragmentedBytesDivisor;
+        if (Diff <= OptimalFitThresholdBytes)
+          break;
       }
-      if (Found) {
-        Entry = Entries[OptimalFitIndex];
-        remove(OptimalFitIndex);
+      if (RetrievedIndex != CachedBlock::InvalidEntry) {
+        Entry = Entries[RetrievedIndex];
+        remove(RetrievedIndex);
         SuccessfulRetrieves++;
       }
     }
+    //   The difference between the retrieved memory chunk and the request
+    //   size is at most MaxAllowedFragmentedPages
+    //
+    //   +- MaxAllowedFragmentedPages * PageSize -+
+    //   +--------------------------+-------------+
+    //   |                          |             |
+    //   +--------------------------+-------------+
+    //   \   Bytes to be released   /             ^
+    //                                            |
+    //                              (may or may not be committed)
+    //
+    //   The maximum number of bytes released to the OS is capped by
+    //   MaxReleasedCachePages
+    //
+    //   TODO: Consider making MaxReleasedCachePages configurable since
+    //   the release to OS API can vary across systems.
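// An illustrative, self-contained sketch of the release cap described in the
// comment above, again with plain integers. The function name, the `sketch`
// namespace, and the constants in main() (4 KiB pages, a budget of 4
// unreleased pages, a cap of 4 released pages) are assumptions for the
// example; the patch itself currently sets MaxReleasedCachePages to 0.
#include <algorithm>
#include <cassert>
#include <cstdint>

namespace sketch {
using u64 = std::uint64_t;

constexpr u64 roundUpTo(u64 X, u64 Boundary) {
  return (X + Boundary - 1) / Boundary * Boundary;
}

// Only the fragmented bytes beyond MaxUnreleasedCachePages * PageSize are
// candidates for release, and never more than MaxReleasedCachePages *
// PageSize of them.
u64 bytesToRelease(u64 FragmentedBytes, u64 PageSize,
                   u64 MaxUnreleasedCachePages, u64 MaxReleasedCachePages) {
  const u64 MaxUnreleasedBytes = MaxUnreleasedCachePages * PageSize;
  if (FragmentedBytes <= MaxUnreleasedBytes)
    return 0; // Fragmentation within the budget stays mapped.
  const u64 MaxReleasedBytes = MaxReleasedCachePages * PageSize;
  return roundUpTo(
      std::min(MaxReleasedBytes, FragmentedBytes - MaxUnreleasedBytes),
      PageSize);
}
} // namespace sketch

int main() {
  const sketch::u64 PageSize = 4096;
  // Ten fragmented pages: six exceed the four-page unreleased budget, but the
  // four-page release cap limits the release to 16 KiB.
  assert(sketch::bytesToRelease(10 * PageSize, PageSize,
                                /*MaxUnreleasedCachePages=*/4,
                                /*MaxReleasedCachePages=*/4) == 4 * PageSize);
  // Three fragmented pages fit the budget, so nothing is released.
  assert(sketch::bytesToRelease(3 * PageSize, PageSize, 4, 4) == 0);
  return 0;
}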
+ if (Entry.Time != 0) { + const uptr FragmentedBytes = + roundDown(EntryHeaderPos, PageSize) - Entry.CommitBase; + const uptr MaxUnreleasedCacheBytes = MaxUnreleasedCachePages * PageSize; + if (FragmentedBytes > MaxUnreleasedCacheBytes) { + const uptr MaxReleasedCacheBytes = + CachedBlock::MaxReleasedCachePages * PageSize; + uptr BytesToRelease = + roundUp(Min(MaxReleasedCacheBytes, + FragmentedBytes - MaxUnreleasedCacheBytes), + PageSize); + Entry.MemMap.releaseAndZeroPagesToOS(Entry.CommitBase, BytesToRelease); + } + } + return Entry; } @@ -659,8 +724,13 @@ MapAllocator::tryAllocateFromCache(const Options &Options, uptr Size, FillContentsMode FillContents) { CachedBlock Entry; uptr EntryHeaderPos; + uptr MaxAllowedFragmentedPages = MaxUnreleasedCachePages; + + if (UNLIKELY(useMemoryTagging(Options))) + MaxAllowedFragmentedPages += CachedBlock::MaxReleasedCachePages; - Entry = Cache.retrieve(Size, Alignment, getHeadersSize(), EntryHeaderPos); + Entry = Cache.retrieve(MaxAllowedFragmentedPages, Size, Alignment, + getHeadersSize(), EntryHeaderPos); if (!Entry.isValid()) return nullptr; diff --git a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp index e85b6abdb36d2..3638f1c36ddd9 100644 --- a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp @@ -281,8 +281,8 @@ struct MapAllocatorCacheTest : public Test { std::unique_ptr Cache = std::make_unique(); const scudo::uptr PageSize = scudo::getPageSizeCached(); - // The current test allocation size is set to the minimum size - // needed for the scudo allocator to fall back to the secondary allocator + // The current test allocation size is set to the maximum + // cache entry size static constexpr scudo::uptr TestAllocSize = CacheConfig::getDefaultMaxEntrySize(); @@ -327,7 +327,7 @@ TEST_F(MapAllocatorCacheTest, CacheOrder) { for (scudo::uptr I = CacheConfig::getEntriesArraySize(); I > 0; I--) { scudo::uptr EntryHeaderPos; scudo::CachedBlock Entry = - Cache->retrieve(TestAllocSize, PageSize, 0, EntryHeaderPos); + Cache->retrieve(0, TestAllocSize, PageSize, 0, EntryHeaderPos); EXPECT_EQ(Entry.MemMap.getBase(), MemMaps[I - 1].getBase()); } @@ -336,6 +336,30 @@ TEST_F(MapAllocatorCacheTest, CacheOrder) { MemMap.unmap(); } +TEST_F(MapAllocatorCacheTest, PartialChunkHeuristicRetrievalTest) { + const scudo::uptr FragmentedPages = + 1 + scudo::CachedBlock::MaxReleasedCachePages; + scudo::uptr EntryHeaderPos; + scudo::CachedBlock Entry; + scudo::MemMapT MemMap = allocate(PageSize + FragmentedPages * PageSize); + Cache->store(Options, MemMap.getBase(), MemMap.getCapacity(), + MemMap.getBase(), MemMap); + + // FragmentedPages > MaxAllowedFragmentedPages so PageSize + // cannot be retrieved from the cache + Entry = Cache->retrieve(/*MaxAllowedFragmentedPages=*/0, PageSize, PageSize, + 0, EntryHeaderPos); + EXPECT_FALSE(Entry.isValid()); + + // FragmentedPages == MaxAllowedFragmentedPages so PageSize + // can be retrieved from the cache + Entry = + Cache->retrieve(FragmentedPages, PageSize, PageSize, 0, EntryHeaderPos); + EXPECT_TRUE(Entry.isValid()); + + MemMap.unmap(); +} + TEST_F(MapAllocatorCacheTest, MemoryLeakTest) { std::vector MemMaps; // Fill the cache above MaxEntriesCount to force an eviction @@ -351,7 +375,7 @@ TEST_F(MapAllocatorCacheTest, MemoryLeakTest) { for (scudo::uptr I = CacheConfig::getDefaultMaxEntriesCount(); I > 0; I--) { scudo::uptr EntryHeaderPos; RetrievedEntries.push_back( - 
Cache->retrieve(TestAllocSize, PageSize, 0, EntryHeaderPos)); + Cache->retrieve(0, TestAllocSize, PageSize, 0, EntryHeaderPos)); EXPECT_EQ(MemMaps[I].getBase(), RetrievedEntries.back().MemMap.getBase()); } diff --git a/compiler-rt/lib/scudo/standalone/tests/timing_test.cpp b/compiler-rt/lib/scudo/standalone/tests/timing_test.cpp index 23f0a02ea4277..a762aee48f7c6 100644 --- a/compiler-rt/lib/scudo/standalone/tests/timing_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/timing_test.cpp @@ -154,7 +154,7 @@ TEST_F(ScudoTimingTest, VerifyMax) { unsigned long long MaxNs = std::strtoull(&end[6], &end, 10); ASSERT_TRUE(end != nullptr); - EXPECT_GT(MaxNs, AvgNs); + EXPECT_GE(MaxNs, AvgNs); } TEST_F(ScudoTimingTest, VerifyMultipleTimerCalls) { diff --git a/compiler-rt/test/asan/TestCases/Linux/allocator_oom_test.cpp b/compiler-rt/test/asan/TestCases/Linux/allocator_oom_test.cpp index b096624a7f95b..f60a6a4ef79e6 100644 --- a/compiler-rt/test/asan/TestCases/Linux/allocator_oom_test.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/allocator_oom_test.cpp @@ -31,7 +31,7 @@ // ASan shadow memory on s390 is too large for this test. // AArch64 bots fail on this test. // TODO(alekseys): Android lit do not run ulimit on device. -// REQUIRES: shadow-scale-3 +// REQUIRES: shell, shadow-scale-3 // UNSUPPORTED: android, target={{(s390|aarch64|powerpc64le).*}} #include diff --git a/compiler-rt/test/asan/TestCases/Posix/deep_call_stack.cpp b/compiler-rt/test/asan/TestCases/Posix/deep_call_stack.cpp index e6e82a4757205..37aa7b11a231a 100644 --- a/compiler-rt/test/asan/TestCases/Posix/deep_call_stack.cpp +++ b/compiler-rt/test/asan/TestCases/Posix/deep_call_stack.cpp @@ -1,4 +1,5 @@ // Check that UAR mode can handle very deep recusrion. +// REQUIRES: shell // RUN: %clangxx_asan -O2 %s -o %t // RUN: ulimit -s 4096 // RUN: %env_asan_opts=detect_stack_use_after_return=1 %run %t 2>&1 | FileCheck %s diff --git a/compiler-rt/test/fuzzer/focus-function.test b/compiler-rt/test/fuzzer/focus-function.test index ec4a03c95a635..64fd5eebb2389 100644 --- a/compiler-rt/test/fuzzer/focus-function.test +++ b/compiler-rt/test/fuzzer/focus-function.test @@ -1,7 +1,8 @@ # Tests -focus_function # # TODO: don't require linux. -# REQUIRES: linux +# Requires full shell support for the `for` loop syntax. +# REQUIRES: shell, linux UNSUPPORTED: target=aarch64{{.*}} RUN: %cpp_compiler %S/OnlySomeBytesTest.cpp -o %t-exe diff --git a/compiler-rt/test/fuzzer/merge-posix.test b/compiler-rt/test/fuzzer/merge-posix.test index 9fece647ca60b..2721668fb9706 100644 --- a/compiler-rt/test/fuzzer/merge-posix.test +++ b/compiler-rt/test/fuzzer/merge-posix.test @@ -1,5 +1,5 @@ +REQUIRES: shell XFAIL: ios -UNSUPPORTED: target={{.*windows.*}} RUN: %cpp_compiler %S/FullCoverageSetTest.cpp -o %t-FullCoverageSetTest RUN: rm -rf %tmp/T1 %tmp/T2 diff --git a/compiler-rt/test/fuzzer/out-of-process-fuzz.test b/compiler-rt/test/fuzzer/out-of-process-fuzz.test index 4bd866061f1fc..d239bfac1b9cb 100644 --- a/compiler-rt/test/fuzzer/out-of-process-fuzz.test +++ b/compiler-rt/test/fuzzer/out-of-process-fuzz.test @@ -14,8 +14,8 @@ RUN: echo %t # Out-of-process fuzzing with this rig is slow, # we can not wait for the fuzzer to find the faulty input. # Just run for a bit and observe the corpus expansion. 
-RUN: LIBFUZZER_OOP_TARGET="./oop-target > /dev/null 2>&1 " ./oop-fuzzer -max_len=3 OOP_CORPUS -runs=1000 -jobs=4 -CHECK: Running: OOP_CORPUS/ -CHECK: Running: OOP_CORPUS/ -CHECK: Running: OOP_CORPUS/ +RUN: env LIBFUZZER_OOP_TARGET="./oop-target > /dev/null 2>&1 " ./oop-fuzzer -max_len=3 OOP_CORPUS -runs=1000 -jobs=4 +CHECK: Running: {{.*}}OOP_CORPUS/ +CHECK: Running: {{.*}}OOP_CORPUS/ +CHECK: Running: {{.*}}OOP_CORPUS/ RUN: ./oop-target OOP_CORPUS/* 2>&1 | FileCheck %s diff --git a/compiler-rt/test/fuzzer/ulimit.test b/compiler-rt/test/fuzzer/ulimit.test index 223f2ac9bb6e2..e330a97cc07c5 100644 --- a/compiler-rt/test/fuzzer/ulimit.test +++ b/compiler-rt/test/fuzzer/ulimit.test @@ -1,5 +1,4 @@ -# FIXME: Disabled on Windows for now because Windows has no ulimit command. -UNSUPPORTED: target={{.*windows.*}} +REQUIRES: shell RUN: %cpp_compiler %S/SimpleTest.cpp -o %t-SimpleTest RUN: ulimit -s 1000 RUN: not %run %t-SimpleTest diff --git a/compiler-rt/test/hwasan/TestCases/hwasan_symbolize.cpp b/compiler-rt/test/hwasan/TestCases/hwasan_symbolize.cpp index 6e28e19122317..41dd0902b794c 100644 --- a/compiler-rt/test/hwasan/TestCases/hwasan_symbolize.cpp +++ b/compiler-rt/test/hwasan/TestCases/hwasan_symbolize.cpp @@ -1,8 +1,9 @@ -// RUN: %clang_hwasan -Wl,--build-id -g %s -o %t -// RUN: echo '[{"prefix": "'"$(realpath $(dirname %s))"'/", "link": "http://test.invalid/{file}:{line}"}]' > %t.linkify -// RUN: %env_hwasan_opts=symbolize=0 not %run %t 2>&1 | hwasan_symbolize --html --symbols $(dirname %t) --index | FileCheck %s -// RUN: %env_hwasan_opts=symbolize=0 not %run %t 2>&1 | hwasan_symbolize --html --linkify %t.linkify --symbols $(dirname %t) --index | FileCheck --check-prefixes=CHECK,LINKIFY %s -// RUN: %env_hwasan_opts=symbolize=0 not %run %t 2>&1 | hwasan_symbolize --symbols $(dirname %t) --index | FileCheck %s +// RUN: rm -rf %t && mkdir -p %t +// RUN: %clang_hwasan -Wl,--build-id -g %s -o %t/hwasan_symbolize_test +// RUN: echo '[{"prefix": "'"%S"'/", "link": "http://test.invalid/{file}:{line}"}]' > %t/hwasan_symbolize_test.linkify +// RUN: %env_hwasan_opts=symbolize=0 not %run %t/hwasan_symbolize_test 2>&1 | hwasan_symbolize --html --symbols %t --index | FileCheck %s +// RUN: %env_hwasan_opts=symbolize=0 not %run %t/hwasan_symbolize_test 2>&1 | hwasan_symbolize --html --linkify %t/hwasan_symbolize_test.linkify --symbols %t --index | FileCheck --check-prefixes=CHECK,LINKIFY %s +// RUN: %env_hwasan_opts=symbolize=0 not %run %t/hwasan_symbolize_test 2>&1 | hwasan_symbolize --symbols %t --index | FileCheck %s #include #include diff --git a/compiler-rt/test/hwasan/TestCases/print-memory-usage.c b/compiler-rt/test/hwasan/TestCases/print-memory-usage.c index 2c89d4e70ebc7..13652fbd921b0 100644 --- a/compiler-rt/test/hwasan/TestCases/print-memory-usage.c +++ b/compiler-rt/test/hwasan/TestCases/print-memory-usage.c @@ -1,4 +1,5 @@ // Tests __hwasan_print_memory_usage. +// REQUIRES: shell // RUN: %clang_hwasan %s -o %t // RUN: ulimit -s 1000 // RUN: %run %t 2>&1 | FileCheck %s diff --git a/compiler-rt/test/msan/Linux/reexec_unlimited_stack.cpp b/compiler-rt/test/msan/Linux/reexec_unlimited_stack.cpp index 61492ec34533f..8dee27047470e 100644 --- a/compiler-rt/test/msan/Linux/reexec_unlimited_stack.cpp +++ b/compiler-rt/test/msan/Linux/reexec_unlimited_stack.cpp @@ -1,5 +1,6 @@ // MSAN re-execs on unlimited stacks. We use that to verify ReExec() uses the // right path. 
+// REQUIRES: shell // RUN: %clangxx_msan -O0 %s -o %t && ulimit -s unlimited && %run %t | FileCheck %s #include diff --git a/compiler-rt/test/profile/Linux/counter_promo_for.c b/compiler-rt/test/profile/Linux/counter_promo_for.c index aa77e6084bf85..f59f3e4b34a26 100644 --- a/compiler-rt/test/profile/Linux/counter_promo_for.c +++ b/compiler-rt/test/profile/Linux/counter_promo_for.c @@ -12,7 +12,9 @@ // RUN: %run %t.nopromo.gen // RUN: llvm-profdata merge -o %t.nopromo.profdata %t.nopromo.prof/ // RUN: llvm-profdata show --counts --all-functions %t.nopromo.profdata > %t.nopromo.dump -// RUN: diff <(llvm-profdata show %t.promo.profdata) <(llvm-profdata show %t.nopromo.profdata) +// RUN: llvm-profdata show %t.promo.profdata > %t.promo.dump +// RUN: llvm-profdata show %t.nopromo.profdata > %t.nopromo.dump +// RUN: diff %t.promo.dump %t.nopromo.dump int g; __attribute__((noinline)) void bar(int i) { g += i; } diff --git a/compiler-rt/test/profile/Linux/counter_promo_nest.c b/compiler-rt/test/profile/Linux/counter_promo_nest.c index ac32d16d706ba..a893108c96e37 100644 --- a/compiler-rt/test/profile/Linux/counter_promo_nest.c +++ b/compiler-rt/test/profile/Linux/counter_promo_nest.c @@ -10,7 +10,9 @@ // RUN: %run %t.nopromo.gen // RUN: llvm-profdata merge -o %t.nopromo.profdata %t.nopromo.prof/ // RUN: llvm-profdata show --counts --all-functions %t.nopromo.profdata > %t.nopromo.dump -// RUN: diff <(llvm-profdata show %t.promo.profdata) <(llvm-profdata show %t.nopromo.profdata) +// RUN: llvm-profdata show %t.promo.profdata > %t.promo.dump +// RUN: llvm-profdata show %t.nopromo.profdata > %t.nopromo.dump +// RUN: diff %t.promo.dump %t.nopromo.dump int g; __attribute__((noinline)) void bar() { g++; diff --git a/compiler-rt/test/profile/Linux/counter_promo_while.c b/compiler-rt/test/profile/Linux/counter_promo_while.c index c6ea3a7282d42..5471571392682 100644 --- a/compiler-rt/test/profile/Linux/counter_promo_while.c +++ b/compiler-rt/test/profile/Linux/counter_promo_while.c @@ -12,7 +12,9 @@ // RUN: %run %t.nopromo.gen // RUN: llvm-profdata merge -o %t.nopromo.profdata %t.nopromo.prof/ // RUN: llvm-profdata show --counts --all-functions %t.nopromo.profdata > %t.nopromo.dump -// RUN: diff <(llvm-profdata show %t.promo.profdata) <(llvm-profdata show %t.nopromo.profdata) +// RUN: llvm-profdata show %t.promo.profdata > %t.promo.dump +// RUN: llvm-profdata show %t.nopromo.profdata > %t.nopromo.dump +// RUN: diff %t.promo.dump %t.nopromo.dump int g; __attribute__((noinline)) void bar(int i) { g += i; } __attribute__((noinline)) void foo(int n, int N) { diff --git a/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate.c b/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate.c index a918d7b629900..426426d9a05a2 100644 --- a/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate.c +++ b/compiler-rt/test/profile/Linux/instrprof-debug-info-correlate.c @@ -7,13 +7,17 @@ // RUN: env LLVM_PROFILE_FILE=%t.d4.proflite %run %t.d4 // RUN: llvm-profdata merge -o %t.d4.profdata --debug-info=%t.d4 %t.d4.proflite -// RUN: diff <(llvm-profdata show --all-functions --counts %t.normal.profdata) <(llvm-profdata show --all-functions --counts %t.d4.profdata) +// RUN: llvm-profdata show --all-functions --counts %t.normal.profdata > %t.normal.dump +// RUN: llvm-profdata show --all-functions --counts %t.d4.profdata > %t.d4.dump +// RUN: diff %t.normal.dump %t.d4.dump // RUN: %clang_pgogen -o %t -g -mllvm --debug-info-correlate -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp 
%S/../Inputs/instrprof-debug-info-correlate-foo.cpp // RUN: env LLVM_PROFILE_FILE=%t.proflite %run %t // RUN: llvm-profdata merge -o %t.profdata --debug-info=%t %t.proflite -// RUN: diff <(llvm-profdata show --all-functions --counts %t.normal.profdata) <(llvm-profdata show --all-functions --counts %t.profdata) +// RUN: llvm-profdata show --all-functions --counts %t.normal.profdata > %t.normal2.dump +// RUN: llvm-profdata show --all-functions --counts %t.profdata > %t.prof.dump +// RUN: diff %t.normal2.dump %t.prof.dump // RUN: %clang_pgogen -o %t.cov -g -mllvm --debug-info-correlate -mllvm -pgo-function-entry-coverage -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp %S/../Inputs/instrprof-debug-info-correlate-foo.cpp // RUN: env LLVM_PROFILE_FILE=%t.cov.proflite %run %t.cov @@ -23,7 +27,9 @@ // RUN: env LLVM_PROFILE_FILE=%t.cov.profraw %run %t.cov.normal // RUN: llvm-profdata merge -o %t.cov.normal.profdata %t.cov.profraw -// RUN: diff <(llvm-profdata show --all-functions --counts %t.cov.normal.profdata) <(llvm-profdata show --all-functions --counts %t.cov.profdata) +// RUN: llvm-profdata show --all-functions --counts %t.cov.normal.profdata > %t.cov.normal.dump +// RUN: llvm-profdata show --all-functions --counts %t.cov.profdata > %t.cov.dump +// RUN: diff %t.cov.normal.dump %t.cov.dump // Test debug info correlate with online merging. @@ -36,11 +42,15 @@ // RUN: env LLVM_PROFILE_FILE=%t.profdir/%m.proflite %run %t // RUN: llvm-profdata merge -o %t.profdata --debug-info=%t %t.profdir/ -// RUN: diff <(llvm-profdata show --all-functions --counts %t.normal.profdata) <(llvm-profdata show --all-functions --counts %t.profdata) +// RUN: llvm-profdata show --all-functions --counts %t.normal.profdata > %t.normal3.dump +// RUN: llvm-profdata show --all-functions --counts %t.profdata > %t.prof3.dump +// RUN: diff %t.normal3.dump %t.prof3.dump // RUN: rm -rf %t.profdir && mkdir %t.profdir // RUN: env LLVM_PROFILE_FILE=%t.profdir/%m.cov.proflite %run %t.cov // RUN: env LLVM_PROFILE_FILE=%t.profdir/%m.cov.proflite %run %t.cov // RUN: llvm-profdata merge -o %t.cov.profdata --debug-info=%t.cov %t.profdir/ -// RUN: diff <(llvm-profdata show --all-functions --counts %t.cov.normal.profdata) <(llvm-profdata show --all-functions --counts %t.cov.profdata) +// RUN: llvm-profdata show --all-functions --counts %t.cov.normal.profdata > %t.cov.normal2.dump +// RUN: llvm-profdata show --all-functions --counts %t.cov.profdata > %t.cov2.dump +// RUN: diff %t.cov.normal2.dump %t.cov2.dump diff --git a/compiler-rt/test/ubsan/TestCases/TypeCheck/vptr.cpp b/compiler-rt/test/ubsan/TestCases/TypeCheck/vptr.cpp index 2d0e48cd84f3c..4f2a5e65005ed 100644 --- a/compiler-rt/test/ubsan/TestCases/TypeCheck/vptr.cpp +++ b/compiler-rt/test/ubsan/TestCases/TypeCheck/vptr.cpp @@ -23,7 +23,7 @@ // RUN: %env_ubsan_opts=halt_on_error=1 not %run %t nN 2>&1 | FileCheck %s --check-prefix=CHECK-NULL-MEMFUN --strict-whitespace // RUN: %env_ubsan_opts=print_stacktrace=1 %run %t dT 2>&1 | FileCheck %s --check-prefix=CHECK-DYNAMIC --allow-unused-prefixes --check-prefix=CHECK-%os-DYNAMIC --strict-whitespace -// RUN: (echo "vptr_check:S"; echo "vptr_check:T"; echo "vptr_check:U") > %t.supp +// RUN: echo -e "vptr_check:S\nvptr_check:T\nvptr_check:U" > %t.supp // RUN: %env_ubsan_opts=halt_on_error=1:suppressions='"%t.supp"' %run %t mS // RUN: %env_ubsan_opts=halt_on_error=1:suppressions='"%t.supp"' %run %t fS // RUN: %env_ubsan_opts=halt_on_error=1:suppressions='"%t.supp"' %run %t cS diff --git 
a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td index e95af629ef32f..f643674f1d5d6 100644 --- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td +++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td @@ -161,10 +161,11 @@ def cuf_DataTransferOp : cuf_Op<"data_transfer", []> { let arguments = (ins Arg:$src, Arg:$dst, + Optional:$shape, cuf_DataTransferKindAttr:$transfer_kind); let assemblyFormat = [{ - $src `to` $dst attr-dict `:` type(operands) + $src `to` $dst (`,` $shape^ `:` type($shape) )? attr-dict `:` type($src) `,` type($dst) }]; let hasVerifier = 1; diff --git a/flang/include/flang/Optimizer/Dialect/FIRType.h b/flang/include/flang/Optimizer/Dialect/FIRType.h index 3498a329ced30..68e03eab7268b 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRType.h +++ b/flang/include/flang/Optimizer/Dialect/FIRType.h @@ -487,11 +487,18 @@ std::string getTypeAsString(mlir::Type ty, const KindMapping &kindMap, /// target dependent type size inquiries in lowering. It would also not be /// straightforward given the need for a kind map that would need to be /// converted in terms of mlir::DataLayoutEntryKey. + +/// This variant terminates the compilation if an unsupported type is passed. std::pair +getTypeSizeAndAlignmentOrCrash(mlir::Location loc, mlir::Type ty, + const mlir::DataLayout &dl, + const fir::KindMapping &kindMap); + +/// This variant returns std::nullopt if an unsupported type is passed. +std::optional> getTypeSizeAndAlignment(mlir::Location loc, mlir::Type ty, const mlir::DataLayout &dl, const fir::KindMapping &kindMap); - } // namespace fir #endif // FORTRAN_OPTIMIZER_DIALECT_FIRTYPE_H diff --git a/flang/include/flang/Runtime/numeric.h b/flang/include/flang/Runtime/numeric.h index e051e86431663..6e1979790e3c6 100644 --- a/flang/include/flang/Runtime/numeric.h +++ b/flang/include/flang/Runtime/numeric.h @@ -377,6 +377,8 @@ CppTypeFor RTDECL(SelectedCharKind)( // SELECTED_INT_KIND CppTypeFor RTDECL(SelectedIntKind)( const char *, int, void *, int); +CppTypeFor RTDECL(SelectedIntKindMasked)( + const char *, int, void *, int, int); // SELECTED_LOGICAL_KIND CppTypeFor RTDECL(SelectedLogicalKind)( @@ -385,6 +387,8 @@ CppTypeFor RTDECL(SelectedLogicalKind)( // SELECTED_REAL_KIND CppTypeFor RTDECL(SelectedRealKind)( const char *, int, void *, int, void *, int, void *, int); +CppTypeFor RTDECL(SelectedRealKindMasked)( + const char *, int, void *, int, void *, int, void *, int, int); // SPACING CppTypeFor RTDECL(Spacing4)( diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index 5c86bd947ce73..c9991ff18d175 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -323,6 +323,7 @@ bool CodeGenAction::beginSourceFileAction() { // run the default passes. mlir::PassManager pm((*mlirModule)->getName(), mlir::OpPassManager::Nesting::Implicit); + (void)mlir::applyPassManagerCLOptions(pm); // Add OpenMP-related passes // WARNING: These passes must be run immediately after the lowering to ensure // that the FIR is correct with respect to OpenMP operations/attributes. diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index ccbb481f472d8..24cd6b22b8925 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -4272,18 +4272,19 @@ class FirConverter : public Fortran::lower::AbstractConverter { base = convertOp.getValue(); // Special case if the rhs is a constant. 
if (matchPattern(base.getDefiningOp(), mlir::m_Constant())) { - builder.create(loc, base, lhsVal, - transferKindAttr); + builder.create( + loc, base, lhsVal, /*shape=*/mlir::Value{}, transferKindAttr); } else { auto associate = hlfir::genAssociateExpr( loc, builder, rhs, rhs.getType(), ".cuf_host_tmp"); builder.create(loc, associate.getBase(), lhsVal, + /*shape=*/mlir::Value{}, transferKindAttr); builder.create(loc, associate); } } else { - builder.create(loc, rhsVal, lhsVal, - transferKindAttr); + builder.create( + loc, rhsVal, lhsVal, /*shape=*/mlir::Value{}, transferKindAttr); } return; } @@ -4293,6 +4294,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { auto transferKindAttr = cuf::DataTransferKindAttr::get( builder.getContext(), cuf::DataTransferKind::DeviceHost); builder.create(loc, rhsVal, lhsVal, + /*shape=*/mlir::Value{}, transferKindAttr); return; } @@ -4303,6 +4305,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { auto transferKindAttr = cuf::DataTransferKindAttr::get( builder.getContext(), cuf::DataTransferKind::DeviceDevice); builder.create(loc, rhsVal, lhsVal, + /*shape=*/mlir::Value{}, transferKindAttr); return; } @@ -4346,8 +4349,8 @@ class FirConverter : public Fortran::lower::AbstractConverter { addSymbol(sym, hlfir::translateToExtendedValue(loc, builder, temp).first, /*forced=*/true); - builder.create(loc, addr, temp, - transferKindAttr); + builder.create( + loc, addr, temp, /*shape=*/mlir::Value{}, transferKindAttr); ++nbDeviceResidentObject; } } diff --git a/flang/lib/Optimizer/CodeGen/Target.cpp b/flang/lib/Optimizer/CodeGen/Target.cpp index 25141102a8c43..7bc730bff76fe 100644 --- a/flang/lib/Optimizer/CodeGen/Target.cpp +++ b/flang/lib/Optimizer/CodeGen/Target.cpp @@ -431,8 +431,8 @@ struct TargetX86_64 : public GenericTarget { return byteOffset; } mlir::Type compType = component.second; - auto [compSize, compAlign] = - fir::getTypeSizeAndAlignment(loc, compType, getDataLayout(), kindMap); + auto [compSize, compAlign] = fir::getTypeSizeAndAlignmentOrCrash( + loc, compType, getDataLayout(), kindMap); byteOffset = llvm::alignTo(byteOffset, compAlign); ArgClass LoComp, HiComp; classify(loc, compType, byteOffset, LoComp, HiComp); @@ -452,8 +452,8 @@ struct TargetX86_64 : public GenericTarget { ArgClass &Hi) const { mlir::Type eleTy = seqTy.getEleTy(); const std::uint64_t arraySize = seqTy.getConstantArraySize(); - auto [eleSize, eleAlign] = - fir::getTypeSizeAndAlignment(loc, eleTy, getDataLayout(), kindMap); + auto [eleSize, eleAlign] = fir::getTypeSizeAndAlignmentOrCrash( + loc, eleTy, getDataLayout(), kindMap); std::uint64_t eleStorageSize = llvm::alignTo(eleSize, eleAlign); for (std::uint64_t i = 0; i < arraySize; ++i) { byteOffset = llvm::alignTo(byteOffset, eleAlign); @@ -641,7 +641,7 @@ struct TargetX86_64 : public GenericTarget { mlir::Type ty) const { CodeGenSpecifics::Marshalling marshal; auto sizeAndAlign = - fir::getTypeSizeAndAlignment(loc, ty, getDataLayout(), kindMap); + fir::getTypeSizeAndAlignmentOrCrash(loc, ty, getDataLayout(), kindMap); // The stack is always 8 byte aligned (note 14 in 3.2.3). 
unsigned short align = std::max(sizeAndAlign.second, static_cast(8)); diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp index f7b36b208a7de..3b4ad95cafe6b 100644 --- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp +++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp @@ -99,6 +99,11 @@ llvm::LogicalResult cuf::AllocateOp::verify() { llvm::LogicalResult cuf::DataTransferOp::verify() { mlir::Type srcTy = getSrc().getType(); mlir::Type dstTy = getDst().getType(); + if (getShape()) { + if (!fir::isa_ref_type(srcTy) || !fir::isa_ref_type(dstTy)) + return emitOpError() + << "shape can only be specified on data transfer with references"; + } if ((fir::isa_ref_type(srcTy) && fir::isa_ref_type(dstTy)) || (fir::isa_box_type(srcTy) && fir::isa_box_type(dstTy)) || (fir::isa_ref_type(srcTy) && fir::isa_box_type(dstTy)) || diff --git a/flang/lib/Optimizer/Dialect/FIRType.cpp b/flang/lib/Optimizer/Dialect/FIRType.cpp index dbccacfa8be26..c1debf28d0033 100644 --- a/flang/lib/Optimizer/Dialect/FIRType.cpp +++ b/flang/lib/Optimizer/Dialect/FIRType.cpp @@ -1393,43 +1393,50 @@ void FIROpsDialect::registerTypes() { OpenACCPointerLikeModel>(*getContext()); } -std::pair +std::optional> fir::getTypeSizeAndAlignment(mlir::Location loc, mlir::Type ty, const mlir::DataLayout &dl, const fir::KindMapping &kindMap) { if (mlir::isa(ty)) { llvm::TypeSize size = dl.getTypeSize(ty); unsigned short alignment = dl.getTypeABIAlignment(ty); - return {size, alignment}; + return std::pair{size, alignment}; } if (auto firCmplx = mlir::dyn_cast(ty)) { - auto [floatSize, floatAlign] = + auto result = getTypeSizeAndAlignment(loc, firCmplx.getEleType(kindMap), dl, kindMap); - return {llvm::alignTo(floatSize, floatAlign) + floatSize, floatAlign}; + if (!result) + return result; + auto [floatSize, floatAlign] = *result; + return std::pair{llvm::alignTo(floatSize, floatAlign) + floatSize, + floatAlign}; } if (auto real = mlir::dyn_cast(ty)) return getTypeSizeAndAlignment(loc, real.getFloatType(kindMap), dl, kindMap); if (auto seqTy = mlir::dyn_cast(ty)) { - auto [eleSize, eleAlign] = - getTypeSizeAndAlignment(loc, seqTy.getEleTy(), dl, kindMap); - + auto result = getTypeSizeAndAlignment(loc, seqTy.getEleTy(), dl, kindMap); + if (!result) + return result; + auto [eleSize, eleAlign] = *result; std::uint64_t size = llvm::alignTo(eleSize, eleAlign) * seqTy.getConstantArraySize(); - return {size, eleAlign}; + return std::pair{size, eleAlign}; } if (auto recTy = mlir::dyn_cast(ty)) { std::uint64_t size = 0; unsigned short align = 1; for (auto component : recTy.getTypeList()) { - auto [compSize, compAlign] = - getTypeSizeAndAlignment(loc, component.second, dl, kindMap); + auto result = getTypeSizeAndAlignment(loc, component.second, dl, kindMap); + if (!result) + return result; + auto [compSize, compAlign] = *result; size = llvm::alignTo(size, compAlign) + llvm::alignTo(compSize, compAlign); align = std::max(align, compAlign); } - return {size, align}; + return std::pair{size, align}; } if (auto logical = mlir::dyn_cast(ty)) { mlir::Type intTy = mlir::IntegerType::get( @@ -1440,7 +1447,24 @@ fir::getTypeSizeAndAlignment(mlir::Location loc, mlir::Type ty, mlir::Type intTy = mlir::IntegerType::get( character.getContext(), kindMap.getCharacterBitsize(character.getFKind())); - return getTypeSizeAndAlignment(loc, intTy, dl, kindMap); + auto result = getTypeSizeAndAlignment(loc, intTy, dl, kindMap); + if (!result) + return result; + auto [compSize, compAlign] = *result; + if (character.hasConstantLen()) + 
compSize *= character.getLen(); + return std::pair{compSize, compAlign}; } - TODO(loc, "computing size of a component"); + return std::nullopt; } + +std::pair +fir::getTypeSizeAndAlignmentOrCrash(mlir::Location loc, mlir::Type ty, + const mlir::DataLayout &dl, + const fir::KindMapping &kindMap) { + std::optional> result = + getTypeSizeAndAlignment(loc, ty, dl, kindMap); + if (result) + return *result; + TODO(loc, "computing size of a component"); +} \ No newline at end of file diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp index 30fc4185575e6..46fc40b714aac 100644 --- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp +++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp @@ -65,20 +65,15 @@ class AddDebugInfoPass : public fir::impl::AddDebugInfoBase { void handleGlobalOp(fir::GlobalOp glocalOp, mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DIScopeAttr scope, + fir::DebugTypeGenerator &typeGen, mlir::SymbolTable *symbolTable, fir::cg::XDeclareOp declOp); void handleFuncOp(mlir::func::FuncOp funcOp, mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DICompileUnitAttr cuAttr, + fir::DebugTypeGenerator &typeGen, mlir::SymbolTable *symbolTable); }; -static uint32_t getLineFromLoc(mlir::Location loc) { - uint32_t line = 1; - if (auto fileLoc = mlir::dyn_cast(loc)) - line = fileLoc.getLine(); - return line; -} - bool debugInfoIsAlreadySet(mlir::Location loc) { if (mlir::isa(loc)) { if (loc->findInstanceOf>()) @@ -103,7 +98,7 @@ void AddDebugInfoPass::handleDeclareOp(fir::cg::XDeclareOp declOp, return; // If this DeclareOp actually represents a global then treat it as such. if (auto global = symbolTable->lookup(declOp.getUniqName())) { - handleGlobalOp(global, fileAttr, scopeAttr, symbolTable, declOp); + handleGlobalOp(global, fileAttr, scopeAttr, typeGen, symbolTable, declOp); return; } @@ -160,19 +155,24 @@ mlir::LLVM::DIModuleAttr AddDebugInfoPass::getOrCreateModuleAttr( void AddDebugInfoPass::handleGlobalOp(fir::GlobalOp globalOp, mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DIScopeAttr scope, + fir::DebugTypeGenerator &typeGen, mlir::SymbolTable *symbolTable, fir::cg::XDeclareOp declOp) { if (debugInfoIsAlreadySet(globalOp.getLoc())) return; - mlir::ModuleOp module = getOperation(); mlir::MLIRContext *context = &getContext(); - fir::DebugTypeGenerator typeGen(module); mlir::OpBuilder builder(context); std::pair result = fir::NameUniquer::deconstruct(globalOp.getSymName()); if (result.first != fir::NameUniquer::NameKind::VARIABLE) return; + // Discard entries that describe a derived type. Usually start with '.c.', + // '.dt.' or '.n.'. It would be better if result of the deconstruct had a flag + // for such values so that we dont have to look at string values. 
+ if (!result.second.name.empty() && result.second.name[0] == '.') + return; + unsigned line = getLineFromLoc(globalOp.getLoc()); // DWARF5 says following about the fortran modules: @@ -214,6 +214,7 @@ void AddDebugInfoPass::handleGlobalOp(fir::GlobalOp globalOp, void AddDebugInfoPass::handleFuncOp(mlir::func::FuncOp funcOp, mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DICompileUnitAttr cuAttr, + fir::DebugTypeGenerator &typeGen, mlir::SymbolTable *symbolTable) { mlir::Location l = funcOp->getLoc(); // If fused location has already been created then nothing to do @@ -221,7 +222,6 @@ void AddDebugInfoPass::handleFuncOp(mlir::func::FuncOp funcOp, if (debugInfoIsAlreadySet(l)) return; - mlir::ModuleOp module = getOperation(); mlir::MLIRContext *context = &getContext(); mlir::OpBuilder builder(context); llvm::StringRef fileName(fileAttr.getName()); @@ -245,7 +245,6 @@ void AddDebugInfoPass::handleFuncOp(mlir::func::FuncOp funcOp, funcName = mlir::StringAttr::get(context, result.second.name); llvm::SmallVector types; - fir::DebugTypeGenerator typeGen(module); for (auto resTy : funcOp.getResultTypes()) { auto tyAttr = typeGen.convertType(resTy, fileAttr, cuAttr, /*declOp=*/nullptr); @@ -285,7 +284,7 @@ void AddDebugInfoPass::handleFuncOp(mlir::func::FuncOp funcOp, if (auto func = symbolTable->lookup(sym.getLeafReference())) { // Make sure that parent is processed. - handleFuncOp(func, fileAttr, cuAttr, symbolTable); + handleFuncOp(func, fileAttr, cuAttr, typeGen, symbolTable); if (auto fusedLoc = mlir::dyn_cast_if_present(func.getLoc())) { if (auto spAttr = @@ -302,7 +301,7 @@ void AddDebugInfoPass::handleFuncOp(mlir::func::FuncOp funcOp, auto spAttr = mlir::LLVM::DISubprogramAttr::get( context, id, compilationUnit, Scope, funcName, fullName, funcFileAttr, - line, line, subprogramFlags, subTypeAttr); + line, line, subprogramFlags, subTypeAttr, /*retainedNodes=*/{}); funcOp->setLoc(builder.getFusedLoc({funcOp->getLoc()}, spAttr)); // Don't process variables if user asked for line tables only. @@ -320,6 +319,14 @@ void AddDebugInfoPass::runOnOperation() { mlir::SymbolTable symbolTable(module); llvm::StringRef fileName; std::string filePath; + std::optional dl = + fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/true); + if (!dl) { + mlir::emitError(module.getLoc(), "Missing data layout attribute in module"); + signalPassFailure(); + return; + } + fir::DebugTypeGenerator typeGen(module, &symbolTable, *dl); // We need 2 type of file paths here. // 1. Name of the file as was presented to compiler. This can be absolute // or relative to 2. @@ -354,13 +361,13 @@ void AddDebugInfoPass::runOnOperation() { isOptimized, debugLevel); module.walk([&](mlir::func::FuncOp funcOp) { - handleFuncOp(funcOp, fileAttr, cuAttr, &symbolTable); + handleFuncOp(funcOp, fileAttr, cuAttr, typeGen, &symbolTable); }); // Process any global which was not processed through DeclareOp. if (debugLevel == mlir::LLVM::DIEmissionKind::Full) { // Process 'GlobalOp' only if full debug info is requested. 
for (auto globalOp : module.getOps()) - handleGlobalOp(globalOp, fileAttr, cuAttr, &symbolTable, + handleGlobalOp(globalOp, fileAttr, cuAttr, typeGen, &symbolTable, /*declOp=*/nullptr); } } diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp index 860c16c9a13ce..54f2a12d80008 100644 --- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp +++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp @@ -15,7 +15,7 @@ #include "DebugTypeGenerator.h" #include "flang/Optimizer/CodeGen/DescriptorModel.h" #include "flang/Optimizer/CodeGen/TypeConverter.h" -#include "flang/Optimizer/Support/DataLayout.h" +#include "flang/Optimizer/Support/InternalNames.h" #include "mlir/Pass/Pass.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/BinaryFormat/Dwarf.h" @@ -44,17 +44,13 @@ std::uint64_t getComponentOffset<0>(const mlir::DataLayout &dl, return 0; } -DebugTypeGenerator::DebugTypeGenerator(mlir::ModuleOp m) - : module(m), kindMapping(getKindMapping(m)) { +DebugTypeGenerator::DebugTypeGenerator(mlir::ModuleOp m, + mlir::SymbolTable *symbolTable_, + const mlir::DataLayout &dl) + : module(m), symbolTable(symbolTable_), dataLayout{&dl}, + kindMapping(getKindMapping(m)) { LLVM_DEBUG(llvm::dbgs() << "DITypeAttr generator\n"); - std::optional dl = - fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/true); - if (!dl) { - mlir::emitError(module.getLoc(), "Missing data layout attribute in module"); - return; - } - mlir::MLIRContext *context = module.getContext(); // The debug information requires the offset of certain fields in the @@ -62,10 +58,12 @@ DebugTypeGenerator::DebugTypeGenerator(mlir::ModuleOp m) mlir::Type llvmDimsType = getDescFieldTypeModel()(context); mlir::Type llvmPtrType = getDescFieldTypeModel()(context); mlir::Type llvmLenType = getDescFieldTypeModel()(context); - dimsOffset = getComponentOffset(*dl, context, llvmDimsType); - dimsSize = dl->getTypeSize(llvmDimsType); - ptrSize = dl->getTypeSize(llvmPtrType); - lenOffset = getComponentOffset(*dl, context, llvmLenType); + dimsOffset = + getComponentOffset(*dataLayout, context, llvmDimsType); + dimsSize = dataLayout->getTypeSize(llvmDimsType); + ptrSize = dataLayout->getTypeSize(llvmPtrType); + lenOffset = + getComponentOffset(*dataLayout, context, llvmLenType); } static mlir::LLVM::DITypeAttr genBasicType(mlir::MLIRContext *context, @@ -154,6 +152,49 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertBoxedSequenceType( dataLocation, /*rank=*/nullptr, allocated, associated); } +mlir::LLVM::DITypeAttr DebugTypeGenerator::convertRecordType( + fir::RecordType Ty, mlir::LLVM::DIFileAttr fileAttr, + mlir::LLVM::DIScopeAttr scope, fir::cg::XDeclareOp declOp) { + mlir::MLIRContext *context = module.getContext(); + auto result = fir::NameUniquer::deconstruct(Ty.getName()); + if (result.first != fir::NameUniquer::NameKind::DERIVED_TYPE) + return genPlaceholderType(context); + + fir::TypeInfoOp tiOp = symbolTable->lookup(Ty.getName()); + unsigned line = (tiOp) ? getLineFromLoc(tiOp.getLoc()) : 1; + + llvm::SmallVector elements; + std::uint64_t offset = 0; + for (auto [fieldName, fieldTy] : Ty.getTypeList()) { + auto result = fir::getTypeSizeAndAlignment(module.getLoc(), fieldTy, + *dataLayout, kindMapping); + // If we get a type whose size we can't determine, we will break the loop + // and generate the derived type with whatever components we have + // assembled thus far. 
+ if (!result) + break; + auto [byteSize, byteAlign] = *result; + // FIXME: Handle non defaults array bound in derived types + mlir::LLVM::DITypeAttr elemTy = + convertType(fieldTy, fileAttr, scope, /*declOp=*/nullptr); + offset = llvm::alignTo(offset, byteAlign); + mlir::LLVM::DIDerivedTypeAttr tyAttr = mlir::LLVM::DIDerivedTypeAttr::get( + context, llvm::dwarf::DW_TAG_member, + mlir::StringAttr::get(context, fieldName), elemTy, byteSize * 8, + byteAlign * 8, offset * 8, /*optional
=*/std::nullopt, + /*extra data=*/nullptr); + elements.push_back(tyAttr); + offset += llvm::alignTo(byteSize, byteAlign); + } + + return mlir::LLVM::DICompositeTypeAttr::get( + context, llvm::dwarf::DW_TAG_structure_type, /*recursive_id=*/{}, + mlir::StringAttr::get(context, result.second.name), fileAttr, line, scope, + /*baseType=*/nullptr, mlir::LLVM::DIFlags::Zero, offset * 8, + /*alignInBits=*/0, elements, /*dataLocation=*/nullptr, /*rank=*/nullptr, + /*allocated=*/nullptr, /*associated=*/nullptr); +} + mlir::LLVM::DITypeAttr DebugTypeGenerator::convertSequenceType( fir::SequenceType seqTy, mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DIScopeAttr scope, fir::cg::XDeclareOp declOp) { @@ -312,6 +353,8 @@ DebugTypeGenerator::convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr, } else if (auto charTy = mlir::dyn_cast_or_null(Ty)) { return convertCharacterType(charTy, fileAttr, scope, declOp, /*hasDescriptor=*/false); + } else if (auto recTy = mlir::dyn_cast_or_null(Ty)) { + return convertRecordType(recTy, fileAttr, scope, declOp); } else if (auto boxTy = mlir::dyn_cast_or_null(Ty)) { auto elTy = boxTy.getElementType(); if (auto seqTy = mlir::dyn_cast_or_null(elTy)) diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h index 5ab6ca5e9f880..e3220f18958df 100644 --- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h +++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h @@ -17,6 +17,7 @@ #include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Optimizer/Dialect/Support/FIRContext.h" #include "flang/Optimizer/Dialect/Support/KindMapping.h" +#include "flang/Optimizer/Support/DataLayout.h" #include "llvm/Support/Debug.h" namespace fir { @@ -24,7 +25,8 @@ namespace fir { /// This converts FIR/mlir type to DITypeAttr. 
class DebugTypeGenerator { public: - DebugTypeGenerator(mlir::ModuleOp module); + DebugTypeGenerator(mlir::ModuleOp module, mlir::SymbolTable *symbolTable, + const mlir::DataLayout &dl); mlir::LLVM::DITypeAttr convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr, @@ -32,6 +34,10 @@ class DebugTypeGenerator { fir::cg::XDeclareOp declOp); private: + mlir::LLVM::DITypeAttr convertRecordType(fir::RecordType Ty, + mlir::LLVM::DIFileAttr fileAttr, + mlir::LLVM::DIScopeAttr scope, + fir::cg::XDeclareOp declOp); mlir::LLVM::DITypeAttr convertSequenceType(fir::SequenceType seqTy, mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DIScopeAttr scope, @@ -59,6 +65,8 @@ class DebugTypeGenerator { bool genAssociated); mlir::ModuleOp module; + mlir::SymbolTable *symbolTable; + const mlir::DataLayout *dataLayout; KindMapping kindMapping; std::uint64_t dimsSize; std::uint64_t dimsOffset; @@ -68,4 +76,11 @@ class DebugTypeGenerator { } // namespace fir +static uint32_t getLineFromLoc(mlir::Location loc) { + uint32_t line = 1; + if (auto fileLoc = mlir::dyn_cast(loc)) + line = fileLoc.getLine(); + return line; +} + #endif // FORTRAN_OPTIMIZER_TRANSFORMS_DEBUGTYPEGENERATOR_H diff --git a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp index 38cdc2b1388d4..51dc48f0fcb12 100644 --- a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp +++ b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp @@ -266,7 +266,7 @@ void LoopVersioningPass::runOnOperation() { if (mlir::isa(elementType) || mlir::isa(elementType) || mlir::isa(elementType)) { - auto [eleSize, eleAlign] = fir::getTypeSizeAndAlignment( + auto [eleSize, eleAlign] = fir::getTypeSizeAndAlignmentOrCrash( arg.getLoc(), elementType, *dl, kindMap); typeSize = llvm::alignTo(eleSize, eleAlign); } diff --git a/flang/lib/Parser/io-parsers.cpp b/flang/lib/Parser/io-parsers.cpp index ca0dbedc8da42..25b09efd40c52 100644 --- a/flang/lib/Parser/io-parsers.cpp +++ b/flang/lib/Parser/io-parsers.cpp @@ -27,7 +27,8 @@ TYPE_PARSER(construct(variable / lookAhead(space / ",);\n"_ch)) || construct(fileUnitNumber) || construct(star)) // R1202 file-unit-number -> scalar-int-expr -TYPE_PARSER(construct(scalarIntExpr / !"="_tok)) +TYPE_PARSER(construct( + scalarIntExpr / (lookAhead(space >> ",)"_ch) || atEndOfStmt))) // R1204 open-stmt -> OPEN ( connect-spec-list ) TYPE_CONTEXT_PARSER("OPEN statement"_en_US, diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index c01d512b4653d..804ada7d11e02 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -207,11 +207,13 @@ void Prescanner::Statement() { toks.Put(id, GetProvenance(at_)); if (auto replaced{preprocessor_.MacroReplacement(toks, *this)}) { auto newLineClass{ClassifyLine(*replaced, GetCurrentProvenance())}; - disableSourceContinuation_ = - newLineClass.kind != LineClassification::Kind::Source; if (newLineClass.kind == LineClassification::Kind::CompilerDirective) { directiveSentinel_ = newLineClass.sentinel; + disableSourceContinuation_ = false; + } else { + disableSourceContinuation_ = + newLineClass.kind != LineClassification::Kind::Source; } } } @@ -1114,39 +1116,33 @@ bool Prescanner::SkipCommentLine(bool afterAmpersand) { SkipToEndOfLine(); omitNewline_ = true; } - return false; - } - auto lineClass{ClassifyLine(nextLine_)}; - if (lineClass.kind == LineClassification::Kind::Comment) { - NextLine(); - return true; } else if (inPreprocessorDirective_) { - return false; - } else if (afterAmpersand && - (lineClass.kind == - 
LineClassification::Kind::ConditionalCompilationDirective || - lineClass.kind == LineClassification::Kind::DefinitionDirective || - lineClass.kind == LineClassification::Kind::PreprocessorDirective || - lineClass.kind == LineClassification::Kind::IncludeDirective || - lineClass.kind == LineClassification::Kind::IncludeLine)) { - SkipToEndOfLine(); - omitNewline_ = true; - skipLeadingAmpersand_ = true; - return false; - } else if (lineClass.kind == - LineClassification::Kind::ConditionalCompilationDirective || - lineClass.kind == LineClassification::Kind::PreprocessorDirective) { - // Allow conditional compilation directives (e.g., #ifdef) to affect - // continuation lines. - // Allow other preprocessor directives, too, except #include - // (when it does not follow '&'), #define, and #undef (because - // they cannot be allowed to affect preceding text on a - // continued line). - preprocessor_.Directive(TokenizePreprocessorDirective(), *this); - return true; } else { - return false; + auto lineClass{ClassifyLine(nextLine_)}; + if (lineClass.kind == LineClassification::Kind::Comment) { + NextLine(); + return true; + } else if (lineClass.kind == + LineClassification::Kind::ConditionalCompilationDirective || + lineClass.kind == LineClassification::Kind::PreprocessorDirective) { + // Allow conditional compilation directives (e.g., #ifdef) to affect + // continuation lines. + // Allow other preprocessor directives, too, except #include + // (when it does not follow '&'), #define, and #undef (because + // they cannot be allowed to affect preceding text on a + // continued line). + preprocessor_.Directive(TokenizePreprocessorDirective(), *this); + return true; + } else if (afterAmpersand && + (lineClass.kind == LineClassification::Kind::DefinitionDirective || + lineClass.kind == LineClassification::Kind::IncludeDirective || + lineClass.kind == LineClassification::Kind::IncludeLine)) { + SkipToEndOfLine(); + omitNewline_ = true; + skipLeadingAmpersand_ = true; + } } + return false; } const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) { diff --git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp index 4708d51d3af4d..c7ec873365564 100644 --- a/flang/lib/Semantics/check-call.cpp +++ b/flang/lib/Semantics/check-call.cpp @@ -56,6 +56,10 @@ static void CheckImplicitInterfaceArg(evaluate::ActualArgument &arg, "%VAL argument must be a scalar numeric or logical expression"_err_en_US); } if (const auto *expr{arg.UnwrapExpr()}) { + if (const Symbol * base{GetFirstSymbol(*expr)}; + base && IsFunctionResult(*base)) { + context.NoteDefinedSymbol(*base); + } if (IsBOZLiteral(*expr)) { messages.Say("BOZ argument requires an explicit interface"_err_en_US); } else if (evaluate::IsNullPointer(*expr)) { @@ -79,10 +83,6 @@ static void CheckImplicitInterfaceArg(evaluate::ActualArgument &arg, messages.Say( "VOLATILE argument requires an explicit interface"_err_en_US); } - if (const Symbol & base{named->GetFirstSymbol()}; - IsFunctionResult(base)) { - context.NoteDefinedSymbol(base); - } } else if (auto argChars{characteristics::DummyArgument::FromActual( "actual argument", *expr, context.foldingContext(), /*forImplicitInterface=*/true)}) { diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index de3fa8794caed..734c34276b13b 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -256,6 +256,9 @@ static bool IsBlockData(const Symbol &symbol) { } void CheckHelper::Check(const 
Symbol &symbol) { + if (symbol.has()) { + return; + } if (symbol.name().size() > common::maxNameLen && &symbol == &symbol.GetUltimate()) { if (context_.ShouldWarn(common::LanguageFeature::LongNames)) { diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index c0478fd439007..ec8f854f64d10 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -1797,6 +1797,9 @@ void AttrsVisitor::SetBindNameOn(Symbol &symbol) { } auto last{label->find_last_not_of(" ")}; label = label->substr(first, last - first + 1); + } else if (symbol.GetIsExplicitBindName()) { + // don't try to override explicit binding name with default + return; } else if (ClassifyProcedure(symbol) == ProcedureDefinitionClass::Internal) { // BIND(C) does not give an implicit binding label to internal procedures. return; diff --git a/flang/runtime/external-unit.cpp b/flang/runtime/external-unit.cpp index 8009151a8a370..d17a92622f844 100644 --- a/flang/runtime/external-unit.cpp +++ b/flang/runtime/external-unit.cpp @@ -65,9 +65,13 @@ ExternalFileUnit *ExternalFileUnit::LookUpOrCreateAnonymous(int unit, bool exists{false}; ExternalFileUnit *result{GetUnitMap().LookUpOrCreate(unit, handler, exists)}; if (result && !exists) { + common::optional action; + if (dir == Direction::Output) { + action = Action::ReadWrite; + } if (!result->OpenAnonymousUnit( dir == Direction::Input ? OpenStatus::Unknown : OpenStatus::Replace, - Action::ReadWrite, Position::Rewind, Convert::Unknown, handler)) { + action, Position::Rewind, Convert::Unknown, handler)) { // fort.N isn't a writable file if (ExternalFileUnit * closed{LookUpForClose(result->unitNumber())}) { closed->DestroyClosed(); diff --git a/flang/runtime/numeric.cpp b/flang/runtime/numeric.cpp index 40bacf07157a2..b5e0851a16cd1 100644 --- a/flang/runtime/numeric.cpp +++ b/flang/runtime/numeric.cpp @@ -95,20 +95,22 @@ template inline RT_API_ATTRS T Scale(T x, std::int64_t p) { } // SELECTED_INT_KIND (16.9.169) -template -inline RT_API_ATTRS CppTypeFor SelectedIntKind(T x) { - if (x <= 2) { +template +inline RT_API_ATTRS CppTypeFor SelectedIntKind( + X x, M mask) { +#if !defined __SIZEOF_INT128__ || defined FLANG_RUNTIME_NO_INTEGER_16 + mask &= ~(1 << 16); +#endif + if (x <= 2 && (mask & (1 << 1))) { return 1; - } else if (x <= 4) { + } else if (x <= 4 && (mask & (1 << 2))) { return 2; - } else if (x <= 9) { + } else if (x <= 9 && (mask & (1 << 4))) { return 4; - } else if (x <= 18) { + } else if (x <= 18 && (mask & (1 << 8))) { return 8; -#if defined __SIZEOF_INT128__ && !defined FLANG_RUNTIME_NO_INTEGER_16 - } else if (x <= 38) { + } else if (x <= 38 && (mask & (1 << 16))) { return 16; -#endif } return -1; } @@ -130,60 +132,52 @@ inline RT_API_ATTRS CppTypeFor SelectedLogicalKind( } // SELECTED_REAL_KIND (16.9.170) -template +template inline RT_API_ATTRS CppTypeFor SelectedRealKind( - P p, R r, D d) { + P p, R r, D d, M mask) { if (d != 2) { return -5; } - -#ifndef FLANG_RUNTIME_NO_REAL_2 - constexpr bool hasReal2{true}; -#else - constexpr bool hasReal2{false}; +#ifdef FLANG_RUNTIME_NO_REAL_2 + mask &= ~(1 << 2); #endif -#ifndef FLANG_RUNTIME_NO_REAL_3 - constexpr bool hasReal3{true}; -#else - constexpr bool hasReal3{false}; +#ifdef FLANG_RUNTIME_NO_REAL_3 + mask &= ~(1 << 3); #endif -#if defined LDBL_MANT_DIG == 64 && !defined FLANG_RUNTIME_NO_REAL_10 - constexpr bool hasReal10{true}; -#else - constexpr bool hasReal10{false}; +#if LDBL_MANT_DIG < 64 || defined FLANG_RUNTIME_NO_REAL_10 + mask &= ~(1 << 10); #endif -#if 
(LDBL_MANT_DIG == 64 || LDBL_MANT_DIG == 113) && \ - !defined FLANG_RUNTIME_NO_REAL_16 - constexpr bool hasReal16{true}; -#else - constexpr bool hasReal16{false}; +#if LDBL_MANT_DIG < 64 || defined FLANG_RUNTIME_NO_REAL_16 + mask &= ~(1 << 16); #endif int error{0}; int kind{0}; - if (hasReal2 && p <= 3) { + if (p <= 3 && (mask & (1 << 2))) { kind = 2; - } else if (p <= 6) { + } else if (p <= 6 && (mask & (1 << 4))) { kind = 4; - } else if (p <= 15) { + } else if (p <= 15 && (mask & (1 << 8))) { kind = 8; - } else if (hasReal10 && p <= 18) { + } else if (p <= 18 && (mask & (1 << 10))) { kind = 10; - } else if (hasReal16 && p <= 33) { + } else if (p <= 33 && (mask & (1 << 16))) { kind = 16; } else { error -= 1; } - if (r <= 4) { - kind = kind < 2 ? (hasReal2 ? 2 : 4) : kind; - } else if (r <= 37) { - kind = kind < 3 ? (hasReal3 && p != 3 ? 3 : 4) : kind; - } else if (r <= 307) { + if (r <= 4 && (mask & (1 << 2))) { + kind = kind < 2 ? 2 : kind; + } else if (r <= 37 && p != 3 && (mask & (1 << 3))) { + kind = kind < 3 ? 3 : kind; + } else if (r <= 37 && (mask & (1 << 4))) { + kind = kind < 4 ? 4 : kind; + } else if (r <= 307 && (mask & (1 << 8))) { kind = kind < 8 ? 8 : kind; - } else if (hasReal10 && r <= 4931) { + } else if (r <= 4931 && (mask & (1 << 10))) { kind = kind < 10 ? 10 : kind; - } else if (hasReal16 && r <= 4931) { + } else if (r <= 4931 && (mask & (1 << 16))) { kind = kind < 16 ? 16 : kind; } else { error -= 2; @@ -790,6 +784,12 @@ CppTypeFor RTDEF(SelectedCharKind)( // SELECTED_INT_KIND CppTypeFor RTDEF(SelectedIntKind)( const char *source, int line, void *x, int xKind) { + return RTNAME(SelectedIntKindMasked)(source, line, x, xKind, + (1 << 1) | (1 << 2) | (1 << 4) | (1 << 8) | (1 << 16)); +} + +CppTypeFor RTDEF(SelectedIntKindMasked)( + const char *source, int line, void *x, int xKind, int mask) { #ifdef __SIZEOF_INT128__ CppTypeFor r = GetIntArgValue>( @@ -798,7 +798,7 @@ CppTypeFor RTDEF(SelectedIntKind)( std::int64_t r = GetIntArgValue( source, line, x, xKind, /*defaultValue*/ 0, /*resKind*/ 8); #endif - return SelectedIntKind(r); + return SelectedIntKind(r, mask); } // SELECTED_LOGICAL_KIND @@ -819,6 +819,14 @@ CppTypeFor RTDEF(SelectedLogicalKind)( CppTypeFor RTDEF(SelectedRealKind)(const char *source, int line, void *precision, int pKind, void *range, int rKind, void *radix, int dKind) { + return RTNAME(SelectedRealKindMasked)(source, line, precision, pKind, range, + rKind, radix, dKind, + (1 << 2) | (1 << 3) | (1 << 4) | (1 << 8) | (1 << 10) | (1 << 16)); +} + +CppTypeFor RTDEF(SelectedRealKindMasked)( + const char *source, int line, void *precision, int pKind, void *range, + int rKind, void *radix, int dKind, int mask) { #ifdef __SIZEOF_INT128__ CppTypeFor p = GetIntArgValue>( @@ -837,7 +845,7 @@ CppTypeFor RTDEF(SelectedRealKind)(const char *source, std::int64_t d = GetIntArgValue( source, line, radix, dKind, /*defaultValue*/ 2, /*resKind*/ 8); #endif - return SelectedRealKind(p, r, d); + return SelectedRealKind(p, r, d, mask); } CppTypeFor RTDEF(Spacing4)( diff --git a/flang/test/Driver/mlir-debug-pass-pipeline.f90 b/flang/test/Driver/mlir-debug-pass-pipeline.f90 index a6316ee7c8312..e44f4e62a7148 100644 --- a/flang/test/Driver/mlir-debug-pass-pipeline.f90 +++ b/flang/test/Driver/mlir-debug-pass-pipeline.f90 @@ -22,6 +22,9 @@ ! DEBUG-ERR: error: invalid value 'invalid' in '-debug-info-kind=invalid' ! DEBUG-ERR-NOT: Pass statistics report +! ALL: Pass statistics report +! ALL: Fortran::lower::VerifierPass + ! ALL: Pass statistics report ! 
ALL: Fortran::lower::VerifierPass diff --git a/flang/test/Driver/mlir-pass-pipeline.f90 b/flang/test/Driver/mlir-pass-pipeline.f90 index 2f35f928e99cf..6c2829d3cc5c5 100644 --- a/flang/test/Driver/mlir-pass-pipeline.f90 +++ b/flang/test/Driver/mlir-pass-pipeline.f90 @@ -9,6 +9,9 @@ end program +! ALL: Pass statistics report +! ALL: Fortran::lower::VerifierPass + ! ALL: Pass statistics report ! ALL: Fortran::lower::VerifierPass diff --git a/flang/test/Driver/mmlir-opts-vs-opts.f90 b/flang/test/Driver/mmlir-opts-vs-opts.f90 new file mode 100644 index 0000000000000..8cd14e02c3fcc --- /dev/null +++ b/flang/test/Driver/mmlir-opts-vs-opts.f90 @@ -0,0 +1,8 @@ +! Verify that mlir pass options are only accessible under `-mmlir`. + +!RUN: %flang_fc1 -emit-hlfir -mmlir -mlir-pass-statistics %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=MMLIR +!RUN: not %flang_fc1 -emit-hlfir -mlir-pass-statistics %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=NOMMLIR + +!MMLIR: Pass statistics report +!NOMMLIR: error: unknown argument: '-mlir-pass-statistics' +end diff --git a/flang/test/Fir/cuf-invalid.fir b/flang/test/Fir/cuf-invalid.fir index 06e08d14b2435..e9aeaa281e2a8 100644 --- a/flang/test/Fir/cuf-invalid.fir +++ b/flang/test/Fir/cuf-invalid.fir @@ -94,3 +94,34 @@ func.func @_QPsub1() { cuf.free %0 : !fir.ref {data_attr = #cuf.cuda} return } + +// ----- + +func.func @_QPsub1(%arg0: !fir.ref> {cuf.data_attr = #cuf.cuda, fir.bindc_name = "adev"}, %arg1: !fir.ref> {fir.bindc_name = "ahost"}, %arg2: !fir.ref {fir.bindc_name = "n"}, %arg3: !fir.ref {fir.bindc_name = "m"}) { + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg2 dummy_scope %0 {uniq_name = "_QFsub1En"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2:2 = hlfir.declare %arg3 dummy_scope %0 {uniq_name = "_QFsub1Em"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %3 = fir.load %1#0 : !fir.ref + %4 = fir.load %2#0 : !fir.ref + %5 = arith.muli %3, %4 : i32 + %6 = fir.convert %5 : (i32) -> i64 + %7 = fir.convert %6 : (i64) -> index + %c0 = arith.constant 0 : index + %8 = arith.cmpi sgt, %7, %c0 : index + %9 = arith.select %8, %7, %c0 : index + %10 = fir.shape %9 : (index) -> !fir.shape<1> + %11:2 = hlfir.declare %arg0(%10) dummy_scope %0 {data_attr = #cuf.cuda, uniq_name = "_QFsub1Eadev"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.box>, !fir.ref>) + %12 = fir.load %1#0 : !fir.ref + %13 = fir.load %2#0 : !fir.ref + %14 = arith.muli %12, %13 : i32 + %15 = fir.convert %14 : (i32) -> i64 + %16 = fir.convert %15 : (i64) -> index + %c0_0 = arith.constant 0 : index + %17 = arith.cmpi sgt, %16, %c0_0 : index + %18 = arith.select %17, %16, %c0_0 : index + %19 = fir.shape %18 : (index) -> !fir.shape<1> + %20:2 = hlfir.declare %arg1(%19) dummy_scope %0 {uniq_name = "_QFsub1Eahost"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.box>, !fir.ref>) + // expected-error@+1{{'cuf.data_transfer' op shape can only be specified on data transfer with references}} + cuf.data_transfer %20#0 to %11#0, %19 : !fir.shape<1> {transfer_kind = #cuf.cuda_transfer} : !fir.box>, !fir.box> + return +} diff --git a/flang/test/Integration/debug-cyclic-derived-type.f90 b/flang/test/Integration/debug-cyclic-derived-type.f90 new file mode 100644 index 0000000000000..03e06336a6e08 --- /dev/null +++ b/flang/test/Integration/debug-cyclic-derived-type.f90 @@ -0,0 +1,15 @@ +! 
RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s + +module m + type t1 + type(t2), pointer :: p + end type + type t2 + type(t1) :: v1 + end type + type(t1) :: v2 + type(t2) :: v3 +end module + +! CHECK-DAG: !DICompositeType(tag: DW_TAG_structure_type, name: "t1"{{.*}}) +! CHECK-DAG: !DICompositeType(tag: DW_TAG_structure_type, name: "t2"{{.*}}) diff --git a/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90 b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90 index 833976ff284a8..5f09371bbaba2 100644 --- a/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90 +++ b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90 @@ -57,6 +57,5 @@ end program compilation_to_obj ! LLVM: @[[GLOB_VAR:[^[:space:]]+]]t = internal global ! LLVM: define internal void @_QQmain..omp_par -! LLVM: %[[LOCAL_VAR:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8 -! LLVM-NEXT: %[[GLOB_VAL:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr @[[GLOB_VAR]]t, align 8 -! LLVM-NEXT: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[GLOB_VAL]], ptr %[[LOCAL_VAR]], align 8 +! LLVM: %[[GLOB_VAL:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr @[[GLOB_VAR]]t, align 8 +! LLVM-NEXT: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[GLOB_VAL]], ptr %{{.*}}, align 8 diff --git a/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90 b/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90 index 1457be05ca102..262075ec9b25d 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90 @@ -26,10 +26,10 @@ end subroutine proc !CHECK: store i32 %[[TID]], ptr %[[TID_LOCAL]] !CHECK: %[[F_priv:.*]] = alloca ptr !CHECK: %[[I_priv:.*]] = alloca i32 -!CHECK: store ptr %{{.*}}, ptr %[[F_priv]] !CHECK: omp.reduction.init: -!CHECK: store i32 0, ptr %[[I_priv]] +!CHECK: store ptr %{{.*}}, ptr %[[F_priv]] +!CHECK: store i32 0, ptr %[[I_priv]] !CHECK: omp.par.region: !CHECK: br label %[[MALLOC_BB:.*]] diff --git a/flang/test/Parser/recovery05.f90 b/flang/test/Parser/recovery05.f90 new file mode 100644 index 0000000000000..9c8c3689b27bd --- /dev/null +++ b/flang/test/Parser/recovery05.f90 @@ -0,0 +1,5 @@ +! RUN: not %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck %s +continue +! CHECK: error: expected end of statement +flush iostat=1 +end diff --git a/flang/test/Preprocessing/line-in-contin.F90 b/flang/test/Preprocessing/line-in-contin.F90 index 138e579bffaa2..28efbd02d3ae8 100644 --- a/flang/test/Preprocessing/line-in-contin.F90 +++ b/flang/test/Preprocessing/line-in-contin.F90 @@ -1,8 +1,10 @@ -! RUN: %flang_fc1 -E %s 2>&1 | FileCheck %s -! CHECK: call foo( 0.) -! CHECK: call foo( 1.) -! CHECK: call foo( 2.) -! CHECK: call foo( 3.) +! RUN: %flang_fc1 -fopenmp -E %s 2>&1 | FileCheck %s +! CHECK: call foo(0.) +! CHECK: call foo(1.) +! CHECK: call foo(2.) +! CHECK: call foo(3.) +! CHECK: !$omp parallel do default(none) private(j) +! CHECK: !$omp end parallel do call foo( & # 100 "bar.h" & 0.) @@ -17,4 +19,16 @@ # 103 "bar.h" & 3. 
& ) +!$omp parallel do & +#ifdef undef +!$omp garbage & +#else +!$omp default(none) & +#endif +!$omp private(j) + do j=1,100 + end do +!$omp end & +# 104 "bar.h" +!$omp parallel do end diff --git a/flang/test/Semantics/declarations03.f90 b/flang/test/Semantics/declarations03.f90 index 65b07e7d5c656..8e6f0a4aaf6bd 100644 --- a/flang/test/Semantics/declarations03.f90 +++ b/flang/test/Semantics/declarations03.f90 @@ -50,6 +50,9 @@ module m !ERROR: BIND_C attribute was already specified on 's5' integer, bind(c, name="ss2") :: s5 + integer, bind(c, name="s6explicit") :: s6 + dimension s6(10) ! caused spurious error + end subroutine common1() diff --git a/flang/test/Semantics/resolve82.f90 b/flang/test/Semantics/resolve82.f90 index 88339742efdb3..989ce1d837c70 100644 --- a/flang/test/Semantics/resolve82.f90 +++ b/flang/test/Semantics/resolve82.f90 @@ -34,6 +34,7 @@ end function procFunc real y common /blk/ y protected y + logical,protected,external,pointer :: z contains @@ -60,3 +61,8 @@ subroutine testProcDecl(arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11) end subroutine testProcDecl end module m + +subroutine subb() + !Ensure no spurious error from a benign UseError + use m, testProcDecl=>z +end diff --git a/flang/test/Semantics/undef-result01.f90 b/flang/test/Semantics/undef-result01.f90 index bf6af11a8d7b9..08e7fe1e44899 100644 --- a/flang/test/Semantics/undef-result01.f90 +++ b/flang/test/Semantics/undef-result01.f90 @@ -148,3 +148,8 @@ function defdByAssociate() s = 1. end associate end + +function defdByElementArgToImplicit() result(r) + real r(1) + call define(r(1)) +end diff --git a/flang/test/Transforms/debug-derived-type-1.fir b/flang/test/Transforms/debug-derived-type-1.fir new file mode 100644 index 0000000000000..e453db6ae6fbb --- /dev/null +++ b/flang/test/Transforms/debug-derived-type-1.fir @@ -0,0 +1,73 @@ +// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s + +// Only enabled on x86_64 +// REQUIRES: x86-registered-target + +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry, dense<64> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>, #dlti.dl_entry<"dlti.endianness", "little">>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu", omp.is_gpu = false, omp.is_target_device = false, omp.version = #omp.version} { + fir.global @_QMm_employeeEemployee : !fir.type<_QMm_employeeTt_employee{t_person:!fir.type<_QMm_employeeTt_person{t_address:!fir.type<_QMm_employeeTt_address{house_number:i32}>,name:!fir.char<1,20>}>,hired_date:!fir.type<_QMm_employeeTt_date{year:i32,month:i32,day:i32}>,monthly_salary:f32}> { + %0 = fir.zero_bits !fir.type<_QMm_employeeTt_employee{t_person:!fir.type<_QMm_employeeTt_person{t_address:!fir.type<_QMm_employeeTt_address{house_number:i32}>,name:!fir.char<1,20>}>,hired_date:!fir.type<_QMm_employeeTt_date{year:i32,month:i32,day:i32}>,monthly_salary:f32}> + fir.has_value %0 : 
!fir.type<_QMm_employeeTt_employee{t_person:!fir.type<_QMm_employeeTt_person{t_address:!fir.type<_QMm_employeeTt_address{house_number:i32}>,name:!fir.char<1,20>}>,hired_date:!fir.type<_QMm_employeeTt_date{year:i32,month:i32,day:i32}>,monthly_salary:f32}> + } loc(#loc5) + fir.global @_QMt1Evar : !fir.type<_QMt1Tt_t1{age:i32,points:!fir.array<3x!fir.complex<4>>,cond:!fir.logical<1>,name:!fir.char<1,20>,ratio:f64}> { + %0 = fir.zero_bits !fir.type<_QMt1Tt_t1{age:i32,points:!fir.array<3x!fir.complex<4>>,cond:!fir.logical<1>,name:!fir.char<1,20>,ratio:f64}> + fir.has_value %0 : !fir.type<_QMt1Tt_t1{age:i32,points:!fir.array<3x!fir.complex<4>>,cond:!fir.logical<1>,name:!fir.char<1,20>,ratio:f64}> + } loc(#loc6) + fir.type_info @_QMt1Tt_t1 noinit nodestroy nofinal : !fir.type<_QMt1Tt_t1{age:i32,points:!fir.array<3x!fir.complex<4>>,cond:!fir.logical<1>,name:!fir.char<1,20>,ratio:f64}> loc(#loc7) + fir.type_info @_QMm_employeeTt_address noinit nodestroy nofinal : !fir.type<_QMm_employeeTt_address{house_number:i32}> loc(#loc1) + fir.type_info @_QMm_employeeTt_person noinit nodestroy nofinal extends !fir.type<_QMm_employeeTt_address{house_number:i32}> : !fir.type<_QMm_employeeTt_person{t_address:!fir.type<_QMm_employeeTt_address{house_number:i32}>,name:!fir.char<1,20>}> loc(#loc2) + fir.type_info @_QMm_employeeTt_date noinit nodestroy nofinal : !fir.type<_QMm_employeeTt_date{year:i32,month:i32,day:i32}> loc(#loc3) + fir.type_info @_QMm_employeeTt_employee noinit nodestroy nofinal extends !fir.type<_QMm_employeeTt_person{t_address:!fir.type<_QMm_employeeTt_address{house_number:i32}>,name:!fir.char<1,20>}> : !fir.type<_QMm_employeeTt_employee{t_person:!fir.type<_QMm_employeeTt_person{t_address:!fir.type<_QMm_employeeTt_address{house_number:i32}>,name:!fir.char<1,20>}>,hired_date:!fir.type<_QMm_employeeTt_date{year:i32,month:i32,day:i32}>,monthly_salary:f32}> loc(#loc4) + fir.type_info @_QFTt_pair noinit nodestroy nofinal : !fir.type<_QFTt_pair{i:i64,x:f64}> loc(#loc8) + func.func @_QQmain() attributes {fir.bindc_name = "test"} { + %1 = fir.alloca !fir.type<_QFTt_pair{i:i64,x:f64}> {bindc_name = "pair", uniq_name = "_QFEpair"} + %2 = fircg.ext_declare %1 {uniq_name = "_QFEpair"} : (!fir.ref>) -> !fir.ref> loc(#loc9) + return + } loc(#loc10) +} +#loc1 = loc("derived1.f90":24:1) +#loc2 = loc("derived1.f90":35:25) +#loc3 = loc("derived1.f90":17:1) +#loc4 = loc("derived1.f90":46:1) +#loc5 = loc("derived1.f90":50:3) +#loc6 = loc("derived1.f90":62:3) +#loc7 = loc("derived1.f90":70:3) +#loc8 = loc("derived1.f90":85:3) +#loc9 = loc("derived1.f90":77:3) +#loc10 = loc("derived1.f90":75:3) + + +// CHECK-DAG: #[[INT_TY:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[INT8_TY:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[REAL4_TY:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[CMX8_TY:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[CMX_ARR:.*]] = #llvm.di_composite_type +// CHECK-DAG: #[[LOG_TY:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[REAL8_TY:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[STR_TY:.*]] = #llvm.di_string_type +// CHECK-DAG: #[[MOD:.*]] = #llvm.di_module<{{.*}}name = "m_employee"{{.*}}> +// CHECK-DAG: #[[MOD1:.*]] = #llvm.di_module<{{.*}}name = "t1"{{.*}}> +// CHECK-DAG: #[[ELMA1:.*]] = #llvm.di_derived_type +// CHECK-DAG: #[[ADDR:.*]] = #llvm.di_composite_type +// CHECK-DAG: #[[ELMD1:.*]] = #llvm.di_derived_type +// CHECK-DAG: #[[ELMD2:.*]] = #llvm.di_derived_type +// CHECK-DAG: #[[ELMD3:.*]] = #llvm.di_derived_type +// CHECK-DAG: #[[DATE:.*]] = #llvm.di_composite_type +// CHECK-DAG: #[[ELMP1:.*]] = 
#llvm.di_derived_type +// CHECK-DAG: #[[ELMP2:.*]] = #llvm.di_derived_type +// CHECK-DAG: #[[PERS:.*]] = #llvm.di_composite_type +// CHECK-DAG: #[[ELME1:.*]] = #llvm.di_derived_type +// CHECK-DAG: #[[ELME2:.*]] = #llvm.di_derived_type +// CHECK-DAG: #[[ELME3:.*]] = #llvm.di_derived_type +// CHECK-DAG: #[[EMP:.*]] = #llvm.di_composite_type + +// CHECK-DAG: #[[ELM1:.*]] = #llvm.di_derived_type +// CHECK-DAG: #[[ELM2:.*]] = #llvm.di_derived_type +// CHECK-DAG: #[[ELM3:.*]] = #llvm.di_derived_type +// CHECK-DAG: #[[ELM4:.*]] = #llvm.di_derived_type +// CHECK-DAG: #[[ELM5:.*]] = #llvm.di_derived_type +// CHECK-DAG: #llvm.di_composite_type + +// CHECK-DAG: #[[SP:.*]] = #llvm.di_subprogram +// CHECK-DAG: #[[ELML1:.*]] = #llvm.di_derived_type +// CHECK-DAG: #[[ELML2:.*]] = #llvm.di_derived_type +// CHECK-DAG: #llvm.di_composite_type diff --git a/libc/test/src/stdio/vfscanf_test.cpp b/libc/test/src/stdio/vfscanf_test.cpp index 7a9cbf7f12388..fa4e27582375f 100644 --- a/libc/test/src/stdio/vfscanf_test.cpp +++ b/libc/test/src/stdio/vfscanf_test.cpp @@ -43,8 +43,8 @@ static int call_vfscanf(::FILE *stream, const char *__restrict format, ...) { return ret; } -TEST(LlvmLibcFScanfTest, WriteToFile) { - const char *FILENAME = "fscanf_output.test"; +TEST(LlvmLibcVFScanfTest, WriteToFile) { + const char *FILENAME = "vfscanf_output.test"; auto FILE_PATH = libc_make_test_file_path(FILENAME); ::FILE *file = scanf_test::fopen(FILE_PATH, "w"); ASSERT_FALSE(file == nullptr); diff --git a/libcxx/cmake/caches/Generic-hardening-mode-fast-with-abi-breaks.cmake b/libcxx/cmake/caches/Generic-hardening-mode-fast-with-abi-breaks.cmake index 4a9389fdcb41c..c0f2bad1c95af 100644 --- a/libcxx/cmake/caches/Generic-hardening-mode-fast-with-abi-breaks.cmake +++ b/libcxx/cmake/caches/Generic-hardening-mode-fast-with-abi-breaks.cmake @@ -1,4 +1,2 @@ set(LIBCXX_HARDENING_MODE "fast" CACHE STRING "") -set(LIBCXX_ABI_DEFINES "_LIBCPP_ABI_BOUNDED_ITERATORS" CACHE STRING "") -set(LIBCXX_ABI_DEFINES "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING" CACHE STRING "") -set(LIBCXX_ABI_DEFINES "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR" CACHE STRING "") +set(LIBCXX_ABI_DEFINES "_LIBCPP_ABI_BOUNDED_ITERATORS;_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING;_LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR" CACHE STRING "") diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list index b14d2cb6c7803..b8e3d05588f96 100644 --- a/libcxx/include/forward_list +++ b/libcxx/include/forward_list @@ -620,8 +620,7 @@ inline void __forward_list_base<_Tp, _Alloc>::swap(__forward_list_base& __x) _NOEXCEPT_(!__node_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v<__node_allocator>) #endif { - std::__swap_allocator( - __alloc(), __x.__alloc(), integral_constant()); + std::__swap_allocator(__alloc(), __x.__alloc()); using std::swap; swap(__before_begin()->__next_, __x.__before_begin()->__next_); } diff --git a/libcxx/include/string b/libcxx/include/string index 6e93a6230cc2c..05d42afb7c9c3 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -827,8 +827,8 @@ public: // Users might provide custom allocators, and prior to C++20 we have no existing way to detect whether the allocator's // pointer type is contiguous (though it has to be by the Standard). Using the wrapper type ensures the iterator is // considered contiguous. 
- typedef __bounded_iter<__wrap_iter> iterator; - typedef __bounded_iter<__wrap_iter> const_iterator; + typedef __bounded_iter<__wrap_iter > iterator; + typedef __bounded_iter<__wrap_iter > const_iterator; #else typedef __wrap_iter iterator; typedef __wrap_iter const_iterator; @@ -1213,7 +1213,7 @@ public: } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 operator __self_view() const _NOEXCEPT { - return __self_view(data(), size()); + return __self_view(typename __self_view::__assume_valid(), data(), size()); } _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS basic_string& @@ -1822,7 +1822,7 @@ public: #if _LIBCPP_STD_VER >= 20 constexpr _LIBCPP_HIDE_FROM_ABI bool starts_with(__self_view __sv) const noexcept { - return __self_view(data(), size()).starts_with(__sv); + return __self_view(typename __self_view::__assume_valid(), data(), size()).starts_with(__sv); } constexpr _LIBCPP_HIDE_FROM_ABI bool starts_with(value_type __c) const noexcept { @@ -1834,7 +1834,7 @@ public: } constexpr _LIBCPP_HIDE_FROM_ABI bool ends_with(__self_view __sv) const noexcept { - return __self_view(data(), size()).ends_with(__sv); + return __self_view(typename __self_view::__assume_valid(), data(), size()).ends_with(__sv); } constexpr _LIBCPP_HIDE_FROM_ABI bool ends_with(value_type __c) const noexcept { @@ -1848,15 +1848,15 @@ public: #if _LIBCPP_STD_VER >= 23 constexpr _LIBCPP_HIDE_FROM_ABI bool contains(__self_view __sv) const noexcept { - return __self_view(data(), size()).contains(__sv); + return __self_view(typename __self_view::__assume_valid(), data(), size()).contains(__sv); } constexpr _LIBCPP_HIDE_FROM_ABI bool contains(value_type __c) const noexcept { - return __self_view(data(), size()).contains(__c); + return __self_view(typename __self_view::__assume_valid(), data(), size()).contains(__c); } constexpr _LIBCPP_HIDE_FROM_ABI bool contains(const value_type* __s) const { - return __self_view(data(), size()).contains(__s); + return __self_view(typename __self_view::__assume_valid(), data(), size()).contains(__s); } #endif diff --git a/libcxx/include/string_view b/libcxx/include/string_view index 2a03ee99e9ab5..cf97e3a9be314 100644 --- a/libcxx/include/string_view +++ b/libcxx/include/string_view @@ -211,6 +211,7 @@ namespace std { #include <__functional/hash.h> #include <__functional/unary_function.h> #include <__fwd/ostream.h> +#include <__fwd/string.h> #include <__fwd/string_view.h> #include <__iterator/bounded_iter.h> #include <__iterator/concepts.h> @@ -689,6 +690,9 @@ private: const value_type* __data_; size_type __size_; + + template + friend class basic_string; }; _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(basic_string_view); diff --git a/libcxx/include/vector b/libcxx/include/vector index 3aa23d8fc1e24..81aab9407714c 100644 --- a/libcxx/include/vector +++ b/libcxx/include/vector @@ -404,8 +404,8 @@ public: // Users might provide custom allocators, and prior to C++20 we have no existing way to detect whether the allocator's // pointer type is contiguous (though it has to be by the Standard). Using the wrapper type ensures the iterator is // considered contiguous. 
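The `__self_view(typename __self_view::__assume_valid(), data(), size())` calls in the basic_string hunk above construct the internal string_view through a tag type, so the view can take the string's own pointer/size pair on trust instead of re-running whatever checks its public constructor performs; the `friend class basic_string` added to string_view above presumably exists so that trusted path does not have to be public. A generic sketch of the tag-dispatch pattern, with illustrative names that are not libc++'s internals:

#include <cassert>
#include <cstddef>

class view {
public:
  // Tag for callers that already guarantee the (pointer, size) pair is valid.
  struct assume_valid {};

  view(const char *data, std::size_t size) : data_(data), size_(size) {
    assert(data != nullptr || size == 0); // public constructor validates
  }
  view(assume_valid, const char *data, std::size_t size)
      : data_(data), size_(size) {} // trusted path: skip the checks

private:
  const char *data_;
  std::size_t size_;
};

// An owning string type can then hand out views of its own buffer as
// view(view::assume_valid(), buf, len), which is the shape of the calls above.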
- typedef __bounded_iter<__wrap_iter> iterator; - typedef __bounded_iter<__wrap_iter> const_iterator; + typedef __bounded_iter<__wrap_iter > iterator; + typedef __bounded_iter<__wrap_iter > const_iterator; #else typedef __wrap_iter iterator; typedef __wrap_iter const_iterator; @@ -1821,8 +1821,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::swap(vector& __x) std::swap(this->__begin_, __x.__begin_); std::swap(this->__end_, __x.__end_); std::swap(this->__end_cap(), __x.__end_cap()); - std::__swap_allocator( - this->__alloc(), __x.__alloc(), integral_constant()); + std::__swap_allocator(this->__alloc(), __x.__alloc()); } template @@ -2820,8 +2819,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector::swap(vector& __x) std::swap(this->__begin_, __x.__begin_); std::swap(this->__size_, __x.__size_); std::swap(this->__cap(), __x.__cap()); - std::__swap_allocator( - this->__alloc(), __x.__alloc(), integral_constant()); + std::__swap_allocator(this->__alloc(), __x.__alloc()); } template diff --git a/libcxx/test/std/atomics/atomics.lockfree/is_always_lock_free.cpp b/libcxx/test/std/atomics/atomics.lockfree/is_always_lock_free.cpp index 2dc7f5c765419..db17221e515d3 100644 --- a/libcxx/test/std/atomics/atomics.lockfree/is_always_lock_free.cpp +++ b/libcxx/test/std/atomics/atomics.lockfree/is_always_lock_free.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include "test_macros.h" diff --git a/libcxx/test/support/fp_compare.h b/libcxx/test/support/fp_compare.h index 3088a211dadc3..237ee50d1f66b 100644 --- a/libcxx/test/support/fp_compare.h +++ b/libcxx/test/support/fp_compare.h @@ -9,16 +9,17 @@ #ifndef SUPPORT_FP_COMPARE_H #define SUPPORT_FP_COMPARE_H -#include // for std::abs #include // for std::max #include -#include <__config> +#include // for std::abs + +#include "test_macros.h" // See https://www.boost.org/doc/libs/1_70_0/libs/test/doc/html/boost_test/testing_tools/extended_comparison/floating_point/floating_points_comparison_theory.html template bool fptest_close(T val, T expected, T eps) { - _LIBCPP_CONSTEXPR T zero = T(0); + TEST_CONSTEXPR T zero = T(0); assert(eps >= zero); // Handle the zero cases diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot index 6353c87c3d865..1c7085d90f0af 100755 --- a/libcxx/utils/ci/run-buildbot +++ b/libcxx/utils/ci/run-buildbot @@ -417,7 +417,7 @@ generic-hardening-mode-fast-with-abi-breaks) clean generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Generic-hardening-mode-fast-with-abi-breaks.cmake" check-runtimes - check-abi-list + # Not checking ABI list since we purposefully enable ABI breaking changes ;; generic-hardening-mode-extensive) clean diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index f9a22b0a7a5f0..3ecd4d241f2cd 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -61,7 +61,7 @@ static StringRef getBasename(StringRef path) { std::string lld::toString(const coff::InputFile *file) { if (!file) return ""; - if (file->parentName.empty() || file->kind() == coff::InputFile::ImportKind) + if (file->parentName.empty()) return std::string(file->getName()); return (getBasename(file->parentName) + "(" + getBasename(file->getName()) + diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index 1dfff0a90f4ae..a5f155bc05bc9 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -494,20 +494,8 @@ void SymbolTable::resolveRemainingUndefines() { StringRef name = undef->getName(); // A weak alias may have been resolved, so check for that. 
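The forward_list and vector hunks above drop the explicit `integral_constant` argument to `std::__swap_allocator`, leaving the two-argument form to decide on its own whether the allocators are exchanged. A generic sketch of that decision, dispatching on `propagate_on_container_swap`; this is illustrative only, not the libc++ implementation:

#include <memory>
#include <utility>

template <class Alloc>
void swap_allocators_if_needed(Alloc &a, Alloc &b) {
  if constexpr (std::allocator_traits<Alloc>::propagate_on_container_swap::value) {
    using std::swap;
    swap(a, b); // the allocator type asks to travel with its container
  } else {
    // Otherwise the allocators stay put; swapping the containers' internal
    // pointers is enough (and the allocators must compare equal for the
    // container swap to be valid in the first place).
    (void)a;
    (void)b;
  }
}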
- if (Defined *d = undef->getWeakAlias()) { - // We want to replace Sym with D. However, we can't just blindly - // copy sizeof(SymbolUnion) bytes from D to Sym because D may be an - // internal symbol, and internal symbols are stored as "unparented" - // Symbols. For that reason we need to check which type of symbol we - // are dealing with and copy the correct number of bytes. - if (isa(d)) - memcpy(sym, d, sizeof(DefinedRegular)); - else if (isa(d)) - memcpy(sym, d, sizeof(DefinedAbsolute)); - else - memcpy(sym, d, sizeof(SymbolUnion)); + if (undef->resolveWeakAlias()) continue; - } // If we can resolve a symbol by removing __imp_ prefix, do that. // This odd rule is for compatibility with MSVC linker. diff --git a/lld/COFF/Symbols.cpp b/lld/COFF/Symbols.cpp index ff8ad1e619116..b098abb80d6f1 100644 --- a/lld/COFF/Symbols.cpp +++ b/lld/COFF/Symbols.cpp @@ -136,6 +136,29 @@ Defined *Undefined::getWeakAlias() { return nullptr; } +bool Undefined::resolveWeakAlias() { + Defined *d = getWeakAlias(); + if (!d) + return false; + + // We want to replace Sym with D. However, we can't just blindly + // copy sizeof(SymbolUnion) bytes from D to Sym because D may be an + // internal symbol, and internal symbols are stored as "unparented" + // Symbols. For that reason we need to check which type of symbol we + // are dealing with and copy the correct number of bytes. + StringRef name = getName(); + if (isa(d)) + memcpy(this, d, sizeof(DefinedRegular)); + else if (isa(d)) + memcpy(this, d, sizeof(DefinedAbsolute)); + else + memcpy(this, d, sizeof(SymbolUnion)); + + nameData = name.data(); + nameSize = name.size(); + return true; +} + MemoryBufferRef LazyArchive::getMemberBuffer() { Archive::Child c = CHECK(sym.getMember(), "could not get the member for symbol " + diff --git a/lld/COFF/Symbols.h b/lld/COFF/Symbols.h index 56b137d56873a..c427a062dc82b 100644 --- a/lld/COFF/Symbols.h +++ b/lld/COFF/Symbols.h @@ -341,6 +341,9 @@ class Undefined : public Symbol { // symbol by searching the chain of fallback symbols. Returns the symbol if // successful, otherwise returns null. Defined *getWeakAlias(); + + // If this symbol is external weak, replace this object with aliased symbol. + bool resolveWeakAlias(); }; // Windows-specific classes. diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index 0360e186ecf0c..35e0f98926ee8 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -2062,6 +2062,24 @@ void Writer::createECChunks() { if (auto thunk = dyn_cast(sym->getChunk())) { hexpthkSec->addChunk(thunk); exportThunks.push_back({thunk, thunk->target}); + } else if (auto def = dyn_cast(sym)) { + // Allow section chunk to be treated as an export thunk if it looks like + // one. + SectionChunk *chunk = def->getChunk(); + if (!chunk->live || chunk->getMachine() != AMD64) + continue; + assert(sym->getName().starts_with("EXP+")); + StringRef targetName = sym->getName().substr(strlen("EXP+")); + // If EXP+#foo is an export thunk of a hybrid patchable function, + // we should use the #foo$hp_target symbol as the redirection target. + // First, try to look up the $hp_target symbol. If it can't be found, + // assume it's a regular function and look for #foo instead. 
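The comment above relies on a naming scheme worth spelling out: for an ARM64EC hybrid-patchable export `#foo`, the custom export thunk is named `EXP+#foo` and the patchable body is `#foo$hp_target` (the `hp-func.s` input further below sets up exactly these symbols). A tiny illustrative helper for the lookup order, not lld's code; the real lookup follows in the next hunk lines:

#include <string>
#include <vector>

// Given an export thunk symbol "EXP+<name>", return candidate redirection
// targets in the order the rewriter tries them: "<name>$hp_target" first,
// then plain "<name>".
std::vector<std::string> candidateThunkTargets(const std::string &thunkSym) {
  const std::string prefix = "EXP+";
  std::string name = thunkSym.substr(prefix.size());
  return {name + "$hp_target", name};
}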
+ Symbol *targetSym = ctx.symtab.find((targetName + "$hp_target").str()); + if (!targetSym) + targetSym = ctx.symtab.find(targetName); + Defined *t = dyn_cast_or_null(targetSym); + if (t && isArm64EC(t->getChunk()->getMachine())) + exportThunks.push_back({chunk, t}); } } diff --git a/lld/test/COFF/arm64ec-cust-export-thunk.s b/lld/test/COFF/arm64ec-cust-export-thunk.s new file mode 100644 index 0000000000000..e56f34c875217 --- /dev/null +++ b/lld/test/COFF/arm64ec-cust-export-thunk.s @@ -0,0 +1,82 @@ +# REQUIRES: aarch64, x86 +# RUN: split-file %s %t.dir && cd %t.dir + +# Test that metadata is generated when a custom export thunk is supplied. + +# RUN: llvm-mc -filetype=obj -triple=arm64ec-windows func.s -o func.obj +# RUN: llvm-mc -filetype=obj -triple=arm64ec-windows hp-func.s -o hp-func.obj +# RUN: llvm-mc -filetype=obj -triple=x86_64-windows thunk.s -o thunk.obj +# RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj + +# RUN: lld-link -out:out.dll -machine:arm64ec func.obj thunk.obj loadconfig-arm64ec.obj -dll -noentry "-export:#func,EXPORTAS,func" + +# RUN: llvm-objdump -d out.dll | FileCheck --check-prefixes=DISASM,DISASM-EXP %s +# DISASM: Disassembly of section .text: +# DISASM-EMPTY: +# DISASM-NEXT: 0000000180001000 <.text>: +# DISASM-NEXT: 180001000: 52800040 mov w0, #0x2 // =2 +# DISASM-NEXT: 180001004: d65f03c0 ret +# DISASM-NEXT: ... +# DISASM-EXP-EMPTY: +# DISASM-EXP-NEXT: 0000000180002000 : +# DISASM-NEXT: 180002000: b8 03 00 00 00 movl $0x3, %eax +# DISASM-NEXT: 180002005: c3 retq + +# RUN: llvm-objdump -p out.dll | FileCheck --check-prefix=EXPORT %s +# EXPORT: Ordinal RVA Name +# EXPORT-NEXT: 1 0x2000 func + +# RUN: llvm-readobj --coff-load-config out.dll | FileCheck --check-prefix=CHPE %s +# CHPE: CodeMap [ +# CHPE-NEXT: 0x1000 - 0x1008 ARM64EC +# CHPE-NEXT: 0x2000 - 0x2006 X64 +# CHPE-NEXT: ] +# CHPE-NEXT: CodeRangesToEntryPoints [ +# CHPE-NEXT: 0x2000 - 0x2006 -> 0x2000 +# CHPE-NEXT: ] +# CHPE-NEXT: RedirectionMetadata [ +# CHPE-NEXT: 0x2000 -> 0x1000 +# CHPE-NEXT: ] + +# RUN: lld-link -out:out2.dll -machine:arm64ec hp-func.obj thunk.obj loadconfig-arm64ec.obj -dll -noentry +# RUN: llvm-objdump -d out2.dll | FileCheck --check-prefix=DISASM %s +# RUN: llvm-readobj --coff-load-config out2.dll | FileCheck --check-prefix=CHPE %s + +#--- func.s + .globl "#func" + .p2align 2, 0x0 +"#func": + mov w0, #2 + ret + +#--- hp-func.s + .section .text,"xr",discard,"#func$hp_target" + .globl "#func$hp_target" + .p2align 2, 0x0 +"#func$hp_target": + mov w0, #2 + ret + + .def "EXP+#func" + .scl 2 + .type 32 + .endef + .weak func +.set func, "EXP+#func" + .weak "#func" +.set "#func", "#func$hp_target" + + .data + .rva func + +#--- thunk.s + .def "EXP+#func" + .scl 2 + .type 32 + .endef + .section .wowthk$aa,"xr",discard,"EXP+#func" + .globl "EXP+#func" + .p2align 2, 0x0 +"EXP+#func": + movl $3, %eax + retq diff --git a/lld/test/COFF/delayimports-error.test b/lld/test/COFF/delayimports-error.test index cced9fb65e614..5f45083cf5e61 100644 --- a/lld/test/COFF/delayimports-error.test +++ b/lld/test/COFF/delayimports-error.test @@ -8,7 +8,7 @@ # RUN: /alternatename:__delayLoadHelper2=main /opt:noref >& %t.log # RUN: FileCheck %s < %t.log -# CHECK: cannot delay-load foo.dll due to import of data: __declspec(dllimport) datasym +# CHECK: cannot delay-load foo.lib(foo.dll) due to import of data: __declspec(dllimport) datasym --- !COFF header: diff --git a/lld/test/COFF/duplicate.test b/lld/test/COFF/duplicate.test index 
76c88b070ff37..11e5aa06318fe 100644 --- a/lld/test/COFF/duplicate.test +++ b/lld/test/COFF/duplicate.test @@ -13,5 +13,5 @@ RUN: not lld-link /out:gamma.exe /subsystem:console /entry:mainCRTStartup gamma. CHECK-GAMMA: error: duplicate symbol: __declspec(dllimport) f CHECK-GAMMA: defined at {{.*}}gamma.obj -CHECK-GAMMA: defined at alpha.dll +CHECK-GAMMA: defined at alpha.lib(alpha.dll) diff --git a/lld/test/COFF/implib-machine.s b/lld/test/COFF/implib-machine.s index 32deff0fc25f8..92f01bbc72799 100644 --- a/lld/test/COFF/implib-machine.s +++ b/lld/test/COFF/implib-machine.s @@ -6,10 +6,10 @@ # RUN: llvm-mc -triple x86_64-windows-msvc %t.dir/test.s -filetype=obj -o %t.dir/test64.obj # RUN: not lld-link -dll -noentry -out:%t32.dll %t.dir/test32.obj %t.dir/test64.lib 2>&1 | FileCheck --check-prefix=ERR32 %s -# ERR32: error: test.dll: machine type x64 conflicts with x86 +# ERR32: error: test64.lib(test.dll): machine type x64 conflicts with x86 # RUN: not lld-link -dll -noentry -out:%t64.dll %t.dir/test64.obj %t.dir/test32.lib 2>&1 | FileCheck --check-prefix=ERR64 %s -# ERR64: error: test.dll: machine type x86 conflicts with x64 +# ERR64: error: test32.lib(test.dll): machine type x86 conflicts with x64 #--- test.s .def @feat.00; diff --git a/lld/test/COFF/symtab.test b/lld/test/COFF/symtab.test index 45e8ed39737a4..6ef2b4d47503c 100644 --- a/lld/test/COFF/symtab.test +++ b/lld/test/COFF/symtab.test @@ -86,6 +86,15 @@ # CHECK-NEXT: StorageClass: External (0x2) # CHECK-NEXT: AuxSymbolCount: 0 # CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: weak_main +# CHECK-NEXT: Value: 0 +# CHECK-NEXT: Section: .text (1) +# CHECK-NEXT: BaseType: Null (0x0) +# CHECK-NEXT: ComplexType: Null (0x0) +# CHECK-NEXT: StorageClass: External (0x2) +# CHECK-NEXT: AuxSymbolCount: 0 +# CHECK-NEXT: } # CHECK-NEXT: ] # NO: Symbols [ @@ -237,4 +246,13 @@ symbols: SimpleType: IMAGE_SYM_TYPE_NULL ComplexType: IMAGE_SYM_DTYPE_NULL StorageClass: IMAGE_SYM_CLASS_LABEL + - Name: weak_main + Value: 0 + SectionNumber: 0 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_WEAK_EXTERNAL + WeakExternal: + TagIndex: 10 + Characteristics: IMAGE_WEAK_EXTERN_SEARCH_ALIAS ... 
diff --git a/lldb/include/lldb/Host/Socket.h b/lldb/include/lldb/Host/Socket.h index 573c881f727d8..304a91bdf6741 100644 --- a/lldb/include/lldb/Host/Socket.h +++ b/lldb/include/lldb/Host/Socket.h @@ -19,6 +19,7 @@ #include "lldb/Utility/Status.h" #ifdef _WIN32 +#include "lldb/Host/Pipe.h" #include "lldb/Host/windows/windows.h" #include #include @@ -32,12 +33,35 @@ namespace lldb_private { #if defined(_WIN32) typedef SOCKET NativeSocket; +typedef lldb::pipe_t shared_fd_t; #else typedef int NativeSocket; +typedef NativeSocket shared_fd_t; #endif +class Socket; class TCPSocket; class UDPSocket; +class SharedSocket { +public: + static const shared_fd_t kInvalidFD; + + SharedSocket(const Socket *socket, Status &error); + + shared_fd_t GetSendableFD() { return m_fd; } + + Status CompleteSending(lldb::pid_t child_pid); + + static Status GetNativeSocket(shared_fd_t fd, NativeSocket &socket); + +private: +#ifdef _WIN32 + Pipe m_socket_pipe; + NativeSocket m_socket; +#endif + shared_fd_t m_fd; +}; + class Socket : public IOObject { public: enum SocketProtocol { diff --git a/lldb/include/lldb/Utility/Status.h b/lldb/include/lldb/Utility/Status.h index fa5768141fa45..a80ebe89e562d 100644 --- a/lldb/include/lldb/Utility/Status.h +++ b/lldb/include/lldb/Utility/Status.h @@ -181,11 +181,12 @@ class Status { bool Success() const; protected: - /// Member variables - ValueType m_code = 0; ///< Status code as an integer value. - lldb::ErrorType m_type = - lldb::eErrorTypeInvalid; ///< The type of the above error code. - mutable std::string m_string; ///< A string representation of the error code. + /// Status code as an integer value. + ValueType m_code = 0; + /// The type of the above error code. + lldb::ErrorType m_type = lldb::eErrorTypeInvalid; + /// A string representation of the error code. 
+ mutable std::string m_string; private: explicit Status(const llvm::formatv_object_base &payload) { SetErrorToGenericError(); diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index 874383a13e2bb..b095171d8fd1a 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -1099,6 +1099,20 @@ def terminate(self): self.send.close() # self.recv.close() + def request_setInstructionBreakpoints(self, memory_reference=[]): + breakpoints = [] + for i in memory_reference: + args_dict = { + "instructionReference": i, + } + breakpoints.append(args_dict) + args_dict = {"breakpoints": breakpoints} + command_dict = { + "command": "setInstructionBreakpoints", + "type": "request", + "arguments": args_dict, + } + return self.send_recv(command_dict) class DebugAdaptorServer(DebugCommunication): def __init__( diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py index 86eba355da83d..709b7aff11d7f 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py @@ -81,7 +81,10 @@ def verify_breakpoint_hit(self, breakpoint_ids): body = stopped_event["body"] if "reason" not in body: continue - if body["reason"] != "breakpoint": + if ( + body["reason"] != "breakpoint" + and body["reason"] != "instruction breakpoint" + ): continue if "description" not in body: continue diff --git a/lldb/source/Host/common/Socket.cpp b/lldb/source/Host/common/Socket.cpp index 7364a12280cfd..aabd562b0557c 100644 --- a/lldb/source/Host/common/Socket.cpp +++ b/lldb/source/Host/common/Socket.cpp @@ -56,10 +56,12 @@ using namespace lldb_private; typedef const char *set_socket_option_arg_type; typedef char *get_socket_option_arg_type; const NativeSocket Socket::kInvalidSocketValue = INVALID_SOCKET; +const shared_fd_t SharedSocket::kInvalidFD = LLDB_INVALID_PIPE; #else // #if defined(_WIN32) typedef const void *set_socket_option_arg_type; typedef void *get_socket_option_arg_type; const NativeSocket Socket::kInvalidSocketValue = -1; +const shared_fd_t SharedSocket::kInvalidFD = Socket::kInvalidSocketValue; #endif // #if defined(_WIN32) static bool IsInterrupted() { @@ -70,6 +72,80 @@ static bool IsInterrupted() { #endif } +SharedSocket::SharedSocket(const Socket *socket, Status &error) { +#ifdef _WIN32 + m_socket = socket->GetNativeSocket(); + m_fd = kInvalidFD; + + // Create a pipe to transfer WSAPROTOCOL_INFO to the child process. + error = m_socket_pipe.CreateNew(true); + if (error.Fail()) + return; + + m_fd = m_socket_pipe.GetReadPipe(); +#else + m_fd = socket->GetNativeSocket(); + error = Status(); +#endif +} + +Status SharedSocket::CompleteSending(lldb::pid_t child_pid) { +#ifdef _WIN32 + // Transfer WSAPROTOCOL_INFO to the child process. 
+ m_socket_pipe.CloseReadFileDescriptor(); + + WSAPROTOCOL_INFO protocol_info; + if (::WSADuplicateSocket(m_socket, child_pid, &protocol_info) == + SOCKET_ERROR) { + int last_error = ::WSAGetLastError(); + return Status("WSADuplicateSocket() failed, error: %d", last_error); + } + + size_t num_bytes; + Status error = + m_socket_pipe.WriteWithTimeout(&protocol_info, sizeof(protocol_info), + std::chrono::seconds(10), num_bytes); + if (error.Fail()) + return error; + if (num_bytes != sizeof(protocol_info)) + return Status("WriteWithTimeout(WSAPROTOCOL_INFO) failed: %d bytes", + num_bytes); +#endif + return Status(); +} + +Status SharedSocket::GetNativeSocket(shared_fd_t fd, NativeSocket &socket) { +#ifdef _WIN32 + socket = Socket::kInvalidSocketValue; + // Read WSAPROTOCOL_INFO from the parent process and create NativeSocket. + WSAPROTOCOL_INFO protocol_info; + { + Pipe socket_pipe(fd, LLDB_INVALID_PIPE); + size_t num_bytes; + Status error = + socket_pipe.ReadWithTimeout(&protocol_info, sizeof(protocol_info), + std::chrono::seconds(10), num_bytes); + if (error.Fail()) + return error; + if (num_bytes != sizeof(protocol_info)) { + return Status( + "socket_pipe.ReadWithTimeout(WSAPROTOCOL_INFO) failed: % d bytes", + num_bytes); + } + } + socket = ::WSASocket(FROM_PROTOCOL_INFO, FROM_PROTOCOL_INFO, + FROM_PROTOCOL_INFO, &protocol_info, 0, 0); + if (socket == INVALID_SOCKET) { + return Status("WSASocket(FROM_PROTOCOL_INFO) failed: error %d", + ::WSAGetLastError()); + } + return Status(); +#else + socket = fd; + return Status(); +#endif +} + struct SocketScheme { const char *m_scheme; const Socket::SocketProtocol m_protocol; diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangHost.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangHost.cpp index 6064c02c7fd67..6de851081598f 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangHost.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangHost.cpp @@ -53,7 +53,7 @@ static bool DefaultComputeClangResourceDirectory(FileSpec &lldb_shlib_spec, std::string raw_path = lldb_shlib_spec.GetPath(); llvm::StringRef parent_dir = llvm::sys::path::parent_path(raw_path); static const std::string clang_resource_path = - clang::driver::Driver::GetResourcesPath("bin/lldb", CLANG_RESOURCE_DIR); + clang::driver::Driver::GetResourcesPath("bin/lldb"); static const llvm::StringRef kResourceDirSuffixes[] = { // LLVM.org's build of LLDB uses the clang resource directory placed diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index 7e0cf36d0de1b..d887d81912b27 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -2737,10 +2737,19 @@ void SymbolFileDWARF::FindTypes(const TypeQuery &query, TypeResults &results) { if (results.AlreadySearched(this)) return; + auto type_basename = query.GetTypeBasename(); + + Log *log = GetLog(DWARFLog::Lookups); + if (log) { + GetObjectFile()->GetModule()->LogMessage( + log, "SymbolFileDWARF::FindTypes(type_basename=\"{0}\")", + type_basename); + } + std::lock_guard guard(GetModuleMutex()); bool have_index_match = false; - m_index->GetTypes(query.GetTypeBasename(), [&](DWARFDIE die) { + m_index->GetTypes(type_basename, [&](DWARFDIE die) { // Check the language, but only if we have a language filter. 
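Stepping back to the `SharedSocket` helper added in Socket.h/Socket.cpp above: the intended flow is that the parent wraps an accepted socket, hands the sendable fd to the child it spawns (on Windows this is a pipe handle, elsewhere the socket fd itself), and then calls `CompleteSending` so the `WSAPROTOCOL_INFO` travels through the pipe; the child rebuilds a usable socket with `GetNativeSocket`. A rough usage sketch against the declarations above; `LaunchChildWithFD` is a hypothetical stand-in for whatever actually spawns the child and communicates the fd to it:

#include "lldb/Host/Socket.h"

using namespace lldb_private;

// Hypothetical launcher: spawns the child process, passes it `fd` (for
// example on its command line), and returns the child's pid.
lldb::pid_t LaunchChildWithFD(shared_fd_t fd);

Status ShareAcceptedSocket(Socket *conn_socket) {
  Status error;
  SharedSocket shared(conn_socket, error);
  if (error.Fail())
    return error;
  shared_fd_t fd = shared.GetSendableFD();
  lldb::pid_t child = LaunchChildWithFD(fd);
  // No-op on POSIX; on Windows this duplicates the socket into the child via
  // WSADuplicateSocket and writes the WSAPROTOCOL_INFO into the pipe.
  return shared.CompleteSending(child);
}

// In the child, SharedSocket::GetNativeSocket(fd, native_socket) converts the
// inherited value back into a NativeSocket.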
if (query.HasLanguage()) { if (!query.LanguageMatches(GetLanguageFamily(*die.GetCU()))) @@ -2779,8 +2788,14 @@ void SymbolFileDWARF::FindTypes(const TypeQuery &query, TypeResults &results) { return !results.Done(query); // Keep iterating if we aren't done. }); - if (results.Done(query)) + if (results.Done(query)) { + if (log) { + GetObjectFile()->GetModule()->LogMessage( + log, "SymbolFileDWARF::FindTypes(type_basename=\"{0}\") => {1}", + type_basename, results.GetTypeMap().GetSize()); + } return; + } // With -gsimple-template-names, a templated type's DW_AT_name will not // contain the template parameters. Try again stripping '<' and anything @@ -2795,10 +2810,10 @@ void SymbolFileDWARF::FindTypes(const TypeQuery &query, TypeResults &results) { // it trims any context items down by removing template parameter names. TypeQuery query_simple(query); if (UpdateCompilerContextForSimpleTemplateNames(query_simple)) { - + auto type_basename_simple = query_simple.GetTypeBasename(); // Copy our match's context and update the basename we are looking for // so we can use this only to compare the context correctly. - m_index->GetTypes(query_simple.GetTypeBasename(), [&](DWARFDIE die) { + m_index->GetTypes(type_basename_simple, [&](DWARFDIE die) { // Check the language, but only if we have a language filter. if (query.HasLanguage()) { if (!query.LanguageMatches(GetLanguageFamily(*die.GetCU()))) @@ -2834,8 +2849,17 @@ void SymbolFileDWARF::FindTypes(const TypeQuery &query, TypeResults &results) { } return !results.Done(query); // Keep iterating if we aren't done. }); - if (results.Done(query)) + if (results.Done(query)) { + if (log) { + GetObjectFile()->GetModule()->LogMessage( + log, + "SymbolFileDWARF::FindTypes(type_basename=\"{0}\") => {1} " + "(simplified as \"{2}\")", + type_basename, results.GetTypeMap().GetSize(), + type_basename_simple); + } return; + } } } @@ -2847,8 +2871,11 @@ void SymbolFileDWARF::FindTypes(const TypeQuery &query, TypeResults &results) { for (const auto &pair : m_external_type_modules) { if (ModuleSP external_module_sp = pair.second) { external_module_sp->FindTypes(query, results); - if (results.Done(query)) + if (results.Done(query)) { + // We don't log the results here as they are already logged in the + // nested FindTypes call return; + } } } } diff --git a/lldb/test/API/tools/lldb-dap/instruction-breakpoint/Makefile b/lldb/test/API/tools/lldb-dap/instruction-breakpoint/Makefile new file mode 100644 index 0000000000000..697527c4e5522 --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/instruction-breakpoint/Makefile @@ -0,0 +1,6 @@ +CXX_SOURCES := main-copy.cpp +CXXFLAGS_EXTRAS := -O0 -g +include Makefile.rules + +main-copy.cpp: main.cpp + cp -f $< $@ diff --git a/lldb/test/API/tools/lldb-dap/instruction-breakpoint/TestDAP_instruction_breakpoint.py b/lldb/test/API/tools/lldb-dap/instruction-breakpoint/TestDAP_instruction_breakpoint.py new file mode 100644 index 0000000000000..3862be7bfa34c --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/instruction-breakpoint/TestDAP_instruction_breakpoint.py @@ -0,0 +1,98 @@ +import dap_server +import shutil +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil +import lldbdap_testcase +import os +import lldb + + +class TestDAP_InstructionBreakpointTestCase(lldbdap_testcase.DAPTestCaseBase): + NO_DEBUG_INFO_TESTCASE = True + + def setUp(self): + lldbdap_testcase.DAPTestCaseBase.setUp(self) + + self.main_basename = "main-copy.cpp" + self.main_path = 
os.path.realpath(self.getBuildArtifact(self.main_basename)) + + @skipIfWindows + def test_instruction_breakpoint(self): + self.build() + self.instruction_breakpoint_test() + + def instruction_breakpoint_test(self): + """Set an instruction breakpoint from the disassembly view and verify it is hit.""" + # Create a target by the debugger. + target = self.createTestTarget() + + main_line = line_number("main.cpp", "breakpoint 1") + + program = self.getBuildArtifact("a.out") + self.build_and_launch(program) + + # Set source breakpoint 1 + response = self.dap_server.request_setBreakpoints(self.main_path, [main_line]) + breakpoints = response["body"]["breakpoints"] + self.assertEqual(len(breakpoints), 1) + breakpoint = breakpoints[0] + self.assertEqual( + breakpoint["line"], main_line, "incorrect breakpoint source line" + ) + self.assertTrue(breakpoint["verified"], "breakpoint is not verified") + self.assertEqual( + self.main_basename, breakpoint["source"]["name"], "incorrect source name" + ) + self.assertEqual( + self.main_path, breakpoint["source"]["path"], "incorrect source file path" + ) + other_breakpoint_id = breakpoint["id"] + + # Continue and then verify the breakpoint + self.dap_server.request_continue() + self.verify_breakpoint_hit([other_breakpoint_id]) + + # Check the stack trace, verify the source info, and record the current instruction pointer reference + frames = self.dap_server.request_stackTrace()["body"]["stackFrames"] + instructionPointerReference = [] + setInstructionBreakpoints = [] + instructionPointerReference.append(frames[0]["instructionPointerReference"]) + self.assertEqual( + frames[0]["source"]["name"], self.main_basename, "incorrect source name" + ) + self.assertEqual( + frames[0]["source"]["path"], self.main_path, "incorrect source file path" + ) + + # Check disassembly view + instruction = self.disassemble(frameIndex=0) + self.assertEqual( + instruction["address"], + instructionPointerReference[0], + "current breakpoint reference is not in the disassembly view", + ) + + # Get next instruction address to set instruction breakpoint + disassembled_instruction_list = self.dap_server.disassembled_instructions + instruction_addr_list = list(disassembled_instruction_list.keys()) + index = instruction_addr_list.index(instructionPointerReference[0]) + if len(instruction_addr_list) > (index + 1): + next_inst_addr = instruction_addr_list[index + 1] + if len(next_inst_addr) > 2: + setInstructionBreakpoints.append(next_inst_addr) + instruction_breakpoint_response = ( + self.dap_server.request_setInstructionBreakpoints( + setInstructionBreakpoints + ) + ) + inst_breakpoints = instruction_breakpoint_response["body"][ + "breakpoints" + ] + self.assertEqual( + inst_breakpoints[0]["instructionReference"], + next_inst_addr, + "Instruction breakpoint was not resolved at the requested address", + ) + self.dap_server.request_continue() + self.verify_breakpoint_hit([inst_breakpoints[0]["id"]]) diff --git a/lldb/test/API/tools/lldb-dap/instruction-breakpoint/main.cpp b/lldb/test/API/tools/lldb-dap/instruction-breakpoint/main.cpp new file mode 100644 index 0000000000000..3c710d6417157 --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/instruction-breakpoint/main.cpp @@ -0,0 +1,18 @@ +#include +#include + +int function(int x) { + + if (x == 0) // breakpoint 1 + return x; + + if ((x % 2) != 0) + return x; + else + return function(x - 1) + x; +} + +int main(int argc, char const *argv[]) { + int n = function(2); + return n; +} \ No newline at end of file diff --git 
a/lldb/tools/lldb-dap/CMakeLists.txt b/lldb/tools/lldb-dap/CMakeLists.txt index f8f0d86453f58..d68098bf7b326 100644 --- a/lldb/tools/lldb-dap/CMakeLists.txt +++ b/lldb/tools/lldb-dap/CMakeLists.txt @@ -38,6 +38,7 @@ add_lldb_tool(lldb-dap SourceBreakpoint.cpp DAP.cpp Watchpoint.cpp + InstructionBreakpoint.cpp LINK_LIBS liblldb diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp index 57b93c28ce930..6012ee52110b7 100644 --- a/lldb/tools/lldb-dap/DAP.cpp +++ b/lldb/tools/lldb-dap/DAP.cpp @@ -68,7 +68,7 @@ static std::string capitalize(llvm::StringRef str) { void DAP::PopulateExceptionBreakpoints() { llvm::call_once(init_exception_breakpoints_flag, [this]() { - exception_breakpoints = std::vector {}; + exception_breakpoints = std::vector{}; if (lldb::SBDebugger::SupportsLanguage(lldb::eLanguageTypeC_plus_plus)) { exception_breakpoints->emplace_back("cpp_catch", "C++ Catch", @@ -996,4 +996,32 @@ void DAP::SetThreadFormat(llvm::StringRef format) { } } +InstructionBreakpoint * +DAP::GetInstructionBreakpoint(const lldb::break_id_t bp_id) { + for (auto &bp : instruction_breakpoints) { + if (bp.second.id == bp_id) + return &bp.second; + } + return nullptr; +} + +InstructionBreakpoint * +DAP::GetInstructionBPFromStopReason(lldb::SBThread &thread) { + const auto num = thread.GetStopReasonDataCount(); + InstructionBreakpoint *inst_bp = nullptr; + for (size_t i = 0; i < num; i += 2) { + // thread.GetStopReasonDataAtIndex(i) will return the bp ID and + // thread.GetStopReasonDataAtIndex(i+1) will return the location + // within that breakpoint. We only care about the bp ID so we can + // see if this is an instruction breakpoint that is getting hit. + lldb::break_id_t bp_id = thread.GetStopReasonDataAtIndex(i); + inst_bp = GetInstructionBreakpoint(bp_id); + // If any breakpoint is not an instruction breakpoint, then stop and + // report this as a normal breakpoint + if (inst_bp == nullptr) + return nullptr; + } + return inst_bp; +} + } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h index 0fc77ac1e8168..f4fdec6e895ad 100644 --- a/lldb/tools/lldb-dap/DAP.h +++ b/lldb/tools/lldb-dap/DAP.h @@ -54,6 +54,7 @@ #include "ExceptionBreakpoint.h" #include "FunctionBreakpoint.h" #include "IOStream.h" +#include "InstructionBreakpoint.h" #include "ProgressEvent.h" #include "RunInTerminal.h" #include "SourceBreakpoint.h" @@ -68,6 +69,8 @@ namespace lldb_dap { typedef llvm::DenseMap SourceBreakpointMap; typedef llvm::StringMap FunctionBreakpointMap; +typedef llvm::DenseMap + InstructionBreakpointMap; enum class OutputType { Console, Stdout, Stderr, Telemetry }; @@ -160,6 +163,7 @@ struct DAP { std::unique_ptr log; llvm::StringMap source_breakpoints; FunctionBreakpointMap function_breakpoints; + InstructionBreakpointMap instruction_breakpoints; std::optional> exception_breakpoints; llvm::once_flag init_exception_breakpoints_flag; std::vector pre_init_commands; @@ -334,6 +338,10 @@ struct DAP { void SetThreadFormat(llvm::StringRef format); + InstructionBreakpoint *GetInstructionBreakpoint(const lldb::break_id_t bp_id); + + InstructionBreakpoint *GetInstructionBPFromStopReason(lldb::SBThread &thread); + private: // Send the JSON in "json_str" to the "out" stream. 
Correctly send the // "Content-Length:" field followed by the length, followed by the raw diff --git a/lldb/tools/lldb-dap/DAPForward.h b/lldb/tools/lldb-dap/DAPForward.h index 8c79488fae8db..159d999a63c82 100644 --- a/lldb/tools/lldb-dap/DAPForward.h +++ b/lldb/tools/lldb-dap/DAPForward.h @@ -15,6 +15,7 @@ struct ExceptionBreakpoint; struct FunctionBreakpoint; struct SourceBreakpoint; struct Watchpoint; +struct InstructionBreakpoint; } // namespace lldb_dap namespace lldb { diff --git a/lldb/tools/lldb-dap/InstructionBreakpoint.cpp b/lldb/tools/lldb-dap/InstructionBreakpoint.cpp new file mode 100644 index 0000000000000..de4f6f5d86717 --- /dev/null +++ b/lldb/tools/lldb-dap/InstructionBreakpoint.cpp @@ -0,0 +1,28 @@ +//===-- InstructionBreakpoint.cpp ------------------------------------*- C++ +//-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "InstructionBreakpoint.h" +#include "DAP.h" + +namespace lldb_dap { + +// Instruction Breakpoint +InstructionBreakpoint::InstructionBreakpoint(const llvm::json::Object &obj) + : Breakpoint(obj), instructionAddressReference(LLDB_INVALID_ADDRESS), id(0), + offset(GetSigned(obj, "offset", 0)) { + GetString(obj, "instructionReference") + .getAsInteger(0, instructionAddressReference); + instructionAddressReference += offset; +} + +void InstructionBreakpoint::SetInstructionBreakpoint() { + bp = g_dap.target.BreakpointCreateByAddress(instructionAddressReference); + id = bp.GetID(); +} +} // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/InstructionBreakpoint.h b/lldb/tools/lldb-dap/InstructionBreakpoint.h new file mode 100644 index 0000000000000..cf1516f46e955 --- /dev/null +++ b/lldb/tools/lldb-dap/InstructionBreakpoint.h @@ -0,0 +1,36 @@ +//===-- InstructionBreakpoint.h --------------------------------------*- C++ +//-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_TOOLS_LLDB_DAP_INSTRUCTIONBREAKPOINT_H +#define LLDB_TOOLS_LLDB_DAP_INSTRUCTIONBREAKPOINT_H + +#include "Breakpoint.h" +#include "llvm/ADT/StringRef.h" + +namespace lldb_dap { + +// Instruction Breakpoint +struct InstructionBreakpoint : public Breakpoint { + + lldb::addr_t instructionAddressReference; + int32_t id; + int32_t offset; + + InstructionBreakpoint() + : Breakpoint(), instructionAddressReference(LLDB_INVALID_ADDRESS), id(0), + offset(0) {} + InstructionBreakpoint(const llvm::json::Object &obj); + + // Set instruction breakpoint in LLDB as a new breakpoint + void SetInstructionBreakpoint(); +}; + +} // namespace lldb_dap + +#endif diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index c080fd395b728..7338e7cf41eb0 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -769,6 +769,70 @@ llvm::json::Value CreateStackFrame(lldb::SBFrame &frame) { return llvm::json::Value(std::move(object)); } +// Response to `setInstructionBreakpoints` request. 
+// "Breakpoint": { +// "type": "object", +// "description": "Response to `setInstructionBreakpoints` request.", +// "properties": { +// "id": { +// "type": "number", +// "description": "The identifier for the breakpoint. It is needed if +// breakpoint events are used to update or remove breakpoints." +// }, +// "verified": { +// "type": "boolean", +// "description": "If true, the breakpoint could be set (but not +// necessarily at the desired location." +// }, +// "message": { +// "type": "string", +// "description": "A message about the state of the breakpoint. +// This is shown to the user and can be used to explain why a breakpoint +// could not be verified." +// }, +// "source": { +// "type": "Source", +// "description": "The source where the breakpoint is located." +// }, +// "line": { +// "type": "number", +// "description": "The start line of the actual range covered by the +// breakpoint." +// }, +// "column": { +// "type": "number", +// "description": "The start column of the actual range covered by the +// breakpoint." +// }, +// "endLine": { +// "type": "number", +// "description": "The end line of the actual range covered by the +// breakpoint." +// }, +// "endColumn": { +// "type": "number", +// "description": "The end column of the actual range covered by the +// breakpoint. If no end line is given, then the end column is assumed to +// be in the start line." +// }, +// "instructionReference": { +// "type": "string", +// "description": "A memory reference to where the breakpoint is set." +// }, +// "offset": { +// "type": "number", +// "description": "The offset from the instruction reference. +// This can be negative." +// }, +// }, +// "required": [ "id", "verified", "line"] +// } +llvm::json::Value CreateInstructionBreakpoint(BreakpointBase *ibp) { + llvm::json::Object object; + ibp->CreateJsonObject(object); + return llvm::json::Value(std::move(object)); +} + // "Thread": { // "type": "object", // "description": "A Thread", @@ -893,7 +957,13 @@ llvm::json::Value CreateThreadStopped(lldb::SBThread &thread, body.try_emplace("reason", "exception"); EmplaceSafeString(body, "description", exc_bp->label); } else { - body.try_emplace("reason", "breakpoint"); + InstructionBreakpoint *inst_bp = + g_dap.GetInstructionBPFromStopReason(thread); + if (inst_bp) { + body.try_emplace("reason", "instruction breakpoint"); + } else { + body.try_emplace("reason", "breakpoint"); + } lldb::break_id_t bp_id = thread.GetStopReasonDataAtIndex(0); lldb::break_id_t bp_loc_id = thread.GetStopReasonDataAtIndex(1); std::string desc_str = diff --git a/lldb/tools/lldb-dap/JSONUtils.h b/lldb/tools/lldb-dap/JSONUtils.h index 1515f5ba2e5f4..b6356630b7268 100644 --- a/lldb/tools/lldb-dap/JSONUtils.h +++ b/lldb/tools/lldb-dap/JSONUtils.h @@ -322,6 +322,17 @@ llvm::json::Value CreateSource(llvm::StringRef source_path); /// definition outlined by Microsoft. llvm::json::Value CreateStackFrame(lldb::SBFrame &frame); +/// Create a "instruction" object for a LLDB disassemble object as described in +/// the Visual Studio Code debug adaptor definition. +/// +/// \param[in] bp +/// The LLDB instruction object used to populate the disassembly +/// instruction. +/// \return +/// A "Scope" JSON object with that follows the formal JSON +/// definition outlined by Microsoft. +llvm::json::Value CreateInstructionBreakpoint(BreakpointBase *ibp); + /// Create a "Thread" object for a LLDB thread object. 
/// /// This function will fill in the following keys in the returned diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp index 495ed0256120e..c5c4b09f15622 100644 --- a/lldb/tools/lldb-dap/lldb-dap.cpp +++ b/lldb/tools/lldb-dap/lldb-dap.cpp @@ -1723,6 +1723,8 @@ void request_initialize(const llvm::json::Object &request) { body.try_emplace("supportsLogPoints", true); // The debug adapter supports data watchpoints. body.try_emplace("supportsDataBreakpoints", true); + // The debug adapter support for instruction breakpoint. + body.try_emplace("supportsInstructionBreakpoints", true); // Put in non-DAP specification lldb specific information. llvm::json::Object lldb_json; @@ -4082,6 +4084,254 @@ void request__testGetTargetBreakpoints(const llvm::json::Object &request) { g_dap.SendJSON(llvm::json::Value(std::move(response))); } +// "SetInstructionBreakpointsRequest" : { +// "allOf" : [ +// {"$ref" : "#/definitions/Request"}, { +// "type" : "object", +// "description" : +// "Replaces all existing instruction breakpoints. Typically, " +// "instruction breakpoints would be set from a disassembly window. " +// "\nTo clear all instruction breakpoints, specify an empty " +// "array.\nWhen an instruction breakpoint is hit, a `stopped` event " +// "(with reason `instruction breakpoint`) is generated.\nClients " +// "should only call this request if the corresponding capability " +// "`supportsInstructionBreakpoints` is true.", +// "properties" : { +// "command" : {"type" : "string", "enum" : +// ["setInstructionBreakpoints"]}, "arguments" : +// {"$ref" : "#/definitions/SetInstructionBreakpointsArguments"} +// }, +// "required" : [ "command", "arguments" ] +// } +// ] +// }, +// "SetInstructionBreakpointsArguments" +// : { +// "type" : "object", +// "description" : "Arguments for `setInstructionBreakpoints` request", +// "properties" : { +// "breakpoints" : { +// "type" : "array", +// "items" : {"$ref" : "#/definitions/InstructionBreakpoint"}, +// "description" : "The instruction references of the breakpoints" +// } +// }, +// "required" : ["breakpoints"] +// }, +// "SetInstructionBreakpointsResponse" +// : { +// "allOf" : [ +// {"$ref" : "#/definitions/Response"}, { +// "type" : "object", +// "description" : "Response to `setInstructionBreakpoints` request", +// "properties" : { +// "body" : { +// "type" : "object", +// "properties" : { +// "breakpoints" : { +// "type" : "array", +// "items" : {"$ref" : "#/definitions/Breakpoint"}, +// "description" : +// "Information about the breakpoints. The array elements +// " "correspond to the elements of the `breakpoints` +// array." +// } +// }, +// "required" : ["breakpoints"] +// } +// }, +// "required" : ["body"] +// } +// ] +// }, +// "InstructionBreakpoint" : { +// "type" : "object", +// "description" : "Properties of a breakpoint passed to the " +// "`setInstructionBreakpoints` request", +// "properties" : { +// "instructionReference" : { +// "type" : "string", +// "description" : +// "The instruction reference of the breakpoint.\nThis should be a " +// "memory or instruction pointer reference from an +// `EvaluateResponse`, " +// "`Variable`, `StackFrame`, `GotoTarget`, or `Breakpoint`." +// }, +// "offset" : { +// "type" : "integer", +// "description" : "The offset from the instruction reference in " +// "bytes.\nThis can be negative." 
+// }, +// "condition" : { +// "type" : "string", +// "description" : "An expression for conditional breakpoints.\nIt is only +// " +// "honored by a debug adapter if the corresponding " +// "capability `supportsConditionalBreakpoints` is true." +// }, +// "hitCondition" : { +// "type" : "string", +// "description" : "An expression that controls how many hits of the " +// "breakpoint are ignored.\nThe debug adapter is expected +// " "to interpret the expression as needed.\nThe +// attribute " "is only honored by a debug adapter if the +// corresponding " "capability +// `supportsHitConditionalBreakpoints` is true." +// }, +// "mode" : { +// "type" : "string", +// "description" : "The mode of this breakpoint. If defined, this must be +// " +// "one of the `breakpointModes` the debug adapter " +// "advertised in its `Capabilities`." +// } +// }, +// "required" : ["instructionReference"] +// }, +// "Breakpoint" +// : { +// "type" : "object", +// "description" : +// "Information about a breakpoint created in `setBreakpoints`, " +// "`setFunctionBreakpoints`, `setInstructionBreakpoints`, or " +// "`setDataBreakpoints` requests.", +// "properties" : { +// "id" : { +// "type" : "integer", +// "description" : +// "The identifier for the breakpoint. It is needed if breakpoint +// " "events are used to update or remove breakpoints." +// }, +// "verified" : { +// "type" : "boolean", +// "description" : "If true, the breakpoint could be set (but not " +// "necessarily at the desired location)." +// }, +// "message" : { +// "type" : "string", +// "description" : "A message about the state of the breakpoint.\nThis +// " +// "is shown to the user and can be used to explain +// why " "a breakpoint could not be verified." +// }, +// "source" : { +// "$ref" : "#/definitions/Source", +// "description" : "The source where the breakpoint is located." +// }, +// "line" : { +// "type" : "integer", +// "description" : +// "The start line of the actual range covered by the breakpoint." +// }, +// "column" : { +// "type" : "integer", +// "description" : +// "Start position of the source range covered by the breakpoint. +// " "It is measured in UTF-16 code units and the client +// capability " +// "`columnsStartAt1` determines whether it is 0- or 1-based." +// }, +// "endLine" : { +// "type" : "integer", +// "description" : +// "The end line of the actual range covered by the breakpoint." +// }, +// "endColumn" : { +// "type" : "integer", +// "description" : +// "End position of the source range covered by the breakpoint. It +// " "is measured in UTF-16 code units and the client capability " +// "`columnsStartAt1` determines whether it is 0- or 1-based.\nIf +// " "no end line is given, then the end column is assumed to be +// in " "the start line." +// }, +// "instructionReference" : { +// "type" : "string", +// "description" : "A memory reference to where the breakpoint is +// set." +// }, +// "offset" : { +// "type" : "integer", +// "description" : "The offset from the instruction reference.\nThis " +// "can be negative." +// }, +// "reason" : { +// "type" : "string", +// "description" : +// "A machine-readable explanation of why a breakpoint may not be +// " "verified. If a breakpoint is verified or a specific reason +// is " "not known, the adapter should omit this property. 
+// Possible " "values include:\n\n- `pending`: Indicates a +// breakpoint might be " "verified in the future, but the adapter +// cannot verify it in the " "current state.\n - `failed`: +// Indicates a breakpoint was not " "able to be verified, and the +// adapter does not believe it can be " "verified without +// intervention.", +// "enum" : [ "pending", "failed" ] +// } +// }, +// "required" : ["verified"] +// }, + +void request_setInstructionBreakpoints(const llvm::json::Object &request) { + llvm::json::Object response; + llvm::json::Array response_breakpoints; + llvm::json::Object body; + FillResponse(request, response); + + auto arguments = request.getObject("arguments"); + auto breakpoints = arguments->getArray("breakpoints"); + + // It holds active instruction breakpoint list received from DAP. + InstructionBreakpointMap request_ibp; + if (breakpoints) { + for (const auto &bp : *breakpoints) { + auto bp_obj = bp.getAsObject(); + if (bp_obj) { + // Read instruction breakpoint request. + InstructionBreakpoint inst_bp(*bp_obj); + // Store them into map for reference. + request_ibp[inst_bp.instructionAddressReference] = std::move(inst_bp); + } + } + + // Iterate previous active instruction breakpoint list. + for (auto &prev_ibp : g_dap.instruction_breakpoints) { + // Find previous instruction breakpoint reference address in newly + // received instruction breakpoint list. + auto inst_reference = request_ibp.find(prev_ibp.first); + // Request for remove and delete the breakpoint, if the prev instruction + // breakpoint ID is not available in active instrcation breakpoint list. + // Means delete removed breakpoint instance. + if (inst_reference == request_ibp.end()) { + g_dap.target.BreakpointDelete(prev_ibp.second.id); + // Update Prev instruction breakpoint list. + g_dap.instruction_breakpoints.erase(prev_ibp.first); + } else { + // Instead of recreating breakpoint instance, update the breakpoint if + // there are any conditional changes. 
+ prev_ibp.second.UpdateBreakpoint(inst_reference->second); + request_ibp.erase(inst_reference); + response_breakpoints.emplace_back( + CreateInstructionBreakpoint(&prev_ibp.second)); + } + } + + for (auto &req_bpi : request_ibp) { + // Add this breakpoint info to the response + g_dap.instruction_breakpoints[req_bpi.first] = std::move(req_bpi.second); + InstructionBreakpoint &new_bp = + g_dap.instruction_breakpoints[req_bpi.first]; + new_bp.SetInstructionBreakpoint(); + response_breakpoints.emplace_back(CreateInstructionBreakpoint(&new_bp)); + } + } + + body.try_emplace("breakpoints", std::move(response_breakpoints)); + response.try_emplace("body", std::move(body)); + g_dap.SendJSON(llvm::json::Value(std::move(response))); +} + void RegisterRequestCallbacks() { g_dap.RegisterRequestCallback("attach", request_attach); g_dap.RegisterRequestCallback("completions", request_completions); @@ -4114,6 +4364,9 @@ void RegisterRequestCallbacks() { g_dap.RegisterRequestCallback("threads", request_threads); g_dap.RegisterRequestCallback("variables", request_variables); g_dap.RegisterRequestCallback("disassemble", request_disassemble); + // Instruction breakpoint request + g_dap.RegisterRequestCallback("setInstructionBreakpoints", + request_setInstructionBreakpoints); // Custom requests g_dap.RegisterRequestCallback("compileUnits", request_compileUnits); g_dap.RegisterRequestCallback("modules", request_modules); diff --git a/lldb/tools/lldb-server/lldb-platform.cpp b/lldb/tools/lldb-server/lldb-platform.cpp index 82a3a0d6b4e51..75f51132aa9cc 100644 --- a/lldb/tools/lldb-server/lldb-platform.cpp +++ b/lldb/tools/lldb-server/lldb-platform.cpp @@ -47,108 +47,6 @@ using namespace llvm; // option descriptors for getopt_long_only() -#ifdef _WIN32 -typedef pipe_t shared_fd_t; -const shared_fd_t kInvalidSharedFD = LLDB_INVALID_PIPE; -#else -typedef NativeSocket shared_fd_t; -const shared_fd_t kInvalidSharedFD = Socket::kInvalidSocketValue; -#endif - -class SharedSocket { -public: - SharedSocket(Connection *conn, Status &error) { - m_fd = kInvalidSharedFD; - - const Socket *socket = - static_cast(conn->GetReadObject().get()); - if (socket == nullptr) { - error = Status("invalid conn socket"); - return; - } - -#ifdef _WIN32 - m_socket = socket->GetNativeSocket(); - - // Create a pipe to transfer WSAPROTOCOL_INFO to the child process. - error = m_socket_pipe.CreateNew(true); - if (error.Fail()) - return; - - m_fd = m_socket_pipe.GetReadPipe(); -#else - m_fd = socket->GetNativeSocket(); - error = Status(); -#endif - } - - shared_fd_t GetSendableFD() { return m_fd; } - - Status CompleteSending(lldb::pid_t child_pid) { -#ifdef _WIN32 - // Transfer WSAPROTOCOL_INFO to the child process. - m_socket_pipe.CloseReadFileDescriptor(); - - WSAPROTOCOL_INFO protocol_info; - if (::WSADuplicateSocket(m_socket, child_pid, &protocol_info) == - SOCKET_ERROR) { - int last_error = ::WSAGetLastError(); - return Status("WSADuplicateSocket() failed, error: %d", last_error); - } - - size_t num_bytes; - Status error = - m_socket_pipe.WriteWithTimeout(&protocol_info, sizeof(protocol_info), - std::chrono::seconds(10), num_bytes); - if (error.Fail()) - return error; - if (num_bytes != sizeof(protocol_info)) - return Status("WriteWithTimeout(WSAPROTOCOL_INFO) failed: %d bytes", - num_bytes); -#endif - return Status(); - } - - static Status GetNativeSocket(shared_fd_t fd, NativeSocket &socket) { -#ifdef _WIN32 - socket = Socket::kInvalidSocketValue; - // Read WSAPROTOCOL_INFO from the parent process and create NativeSocket. 
- WSAPROTOCOL_INFO protocol_info; - { - Pipe socket_pipe(fd, LLDB_INVALID_PIPE); - size_t num_bytes; - Status error = - socket_pipe.ReadWithTimeout(&protocol_info, sizeof(protocol_info), - std::chrono::seconds(10), num_bytes); - if (error.Fail()) - return error; - if (num_bytes != sizeof(protocol_info)) { - return Status( - "socket_pipe.ReadWithTimeout(WSAPROTOCOL_INFO) failed: % d bytes", - num_bytes); - } - } - socket = ::WSASocket(FROM_PROTOCOL_INFO, FROM_PROTOCOL_INFO, - FROM_PROTOCOL_INFO, &protocol_info, 0, 0); - if (socket == INVALID_SOCKET) { - return Status("WSASocket(FROM_PROTOCOL_INFO) failed: error %d", - ::WSAGetLastError()); - } - return Status(); -#else - socket = fd; - return Status(); -#endif - } - -private: -#ifdef _WIN32 - Pipe m_socket_pipe; - NativeSocket m_socket; -#endif - shared_fd_t m_fd; -}; - static int g_debug = 0; static int g_verbose = 0; static int g_server = 0; @@ -259,13 +157,13 @@ static void spawn_process_reaped(lldb::pid_t pid, int signal, int status) { gdbserver_portmap.FreePortForProcess(pid); } -static Status spawn_process(const char *progname, Connection *conn, +static Status spawn_process(const char *progname, const Socket *conn_socket, uint16_t gdb_port, uint16_t port_offset, const lldb_private::Args &args, const std::string &log_file, const StringRef log_channels) { Status error; - SharedSocket shared_socket(conn, error); + SharedSocket shared_socket(conn_socket, error); if (error.Fail()) return error; @@ -363,7 +261,7 @@ int main_platform(int argc, char *argv[]) { StringRef log_channels; // e.g. "lldb process threads:gdb-remote default:linux all" - shared_fd_t fd = kInvalidSharedFD; + shared_fd_t fd = SharedSocket::kInvalidFD; int min_gdbserver_port = 0; int max_gdbserver_port = 0; @@ -480,7 +378,7 @@ int main_platform(int argc, char *argv[]) { } // Print usage and exit if no listening port is specified. - if (listen_host_port.empty() && fd == kInvalidSharedFD) + if (listen_host_port.empty() && fd == SharedSocket::kInvalidFD) show_usage = true; if (show_usage || option_error) { @@ -494,7 +392,7 @@ int main_platform(int argc, char *argv[]) { lldb_private::Args inferior_arguments; inferior_arguments.SetArguments(argc, const_cast(argv)); - if (fd != kInvalidSharedFD) { + if (fd != SharedSocket::kInvalidFD) { // Child process will handle the connection and exit. 
Log *log = GetLog(LLDBLog::Platform); if (!listen_host_port.empty()) { @@ -510,13 +408,14 @@ int main_platform(int argc, char *argv[]) { return socket_error; } - Connection *conn = - new ConnectionFileDescriptor(new TCPSocket(socket, true, false)); GDBRemoteCommunicationServerPlatform platform(Socket::ProtocolTcp, "tcp"); if (port_offset > 0) platform.SetPortOffset(port_offset); platform.SetPortMap(std::move(gdbserver_portmap)); - platform.SetConnection(std::unique_ptr(conn)); + platform.SetConnection( + std::unique_ptr(new ConnectionFileDescriptor( + new TCPSocket(socket, /*should_close=*/true, + /*child_processes_inherit=*/false)))); client_handle(platform, inferior_arguments); return 0; } @@ -578,8 +477,11 @@ int main_platform(int argc, char *argv[]) { fprintf(stderr, "no available gdbserver port for connection - dropping...\n"); } else { - error = spawn_process(progname, conn, *available_port, port_offset, - inferior_arguments, log_file, log_channels); + const Socket *conn_socket = + static_cast(conn->GetReadObject().get()); + error = + spawn_process(progname, conn_socket, *available_port, port_offset, + inferior_arguments, log_file, log_channels); if (error.Fail()) { { diff --git a/lldb/unittests/Expression/ClangParserTest.cpp b/lldb/unittests/Expression/ClangParserTest.cpp index 6f682f6c97fdb..fab4487c73719 100644 --- a/lldb/unittests/Expression/ClangParserTest.cpp +++ b/lldb/unittests/Expression/ClangParserTest.cpp @@ -42,8 +42,8 @@ TEST_F(ClangHostTest, ComputeClangResourceDirectory) { #else std::string path_to_liblldb = "C:\\foo\\bar\\lib\\"; #endif - std::string path_to_clang_dir = clang::driver::Driver::GetResourcesPath( - path_to_liblldb + "liblldb", CLANG_RESOURCE_DIR); + std::string path_to_clang_dir = + clang::driver::Driver::GetResourcesPath(path_to_liblldb + "liblldb"); llvm::SmallString<256> path_to_clang_lib_dir_real; llvm::sys::fs::real_path(path_to_clang_dir, path_to_clang_lib_dir_real); diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 445980a18e7e9..4887ae6ee9966 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -2189,6 +2189,10 @@ example: ``nosanitize_coverage`` This attribute indicates that SanitizerCoverage instrumentation is disabled for this function. +``nosanitize_realtime`` + This attribute indicates that the Realtime Sanitizer instrumentation is + disabled for this function. + This attribute is incompatible with the ``sanitize_realtime`` attribute. ``null_pointer_is_valid`` If ``null_pointer_is_valid`` is set, then the ``null`` address in address-space 0 is considered to be a valid address for memory loads and @@ -2315,6 +2319,7 @@ example: This attribute indicates that RealtimeSanitizer checks (realtime safety analysis - no allocations, syscalls or exceptions) are enabled for this function. + This attribute is incompatible with the ``nosanitize_realtime`` attribute. ``speculative_load_hardening`` This attribute indicates that `Speculative Load Hardening `_ diff --git a/llvm/docs/Statepoints.rst b/llvm/docs/Statepoints.rst index ed137ed4fa8ed..27a46a43b5190 100644 --- a/llvm/docs/Statepoints.rst +++ b/llvm/docs/Statepoints.rst @@ -167,10 +167,11 @@ Let's consider a simple call in LLVM IR: .. 
code-block:: llvm - define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj) + declare void @foo() + define ptr addrspace(1) @test1(ptr addrspace(1) %obj) gc "statepoint-example" { - call void ()* @foo() - ret i8 addrspace(1)* %obj + call void @foo() + ret ptr addrspace(1) %obj } Depending on our language we may need to allow a safepoint during the execution @@ -186,11 +187,11 @@ resulting relocation sequence is: .. code-block:: llvm - define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj) + define ptr addrspace(1) @test(ptr addrspace(1) %obj) gc "statepoint-example" { - %0 = call token (i64, i32, void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %obj) - %obj.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %0, i32 7, i32 7) - ret i8 addrspace(1)* %obj.relocated + %safepoint = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, ptr elementtype(void ()) @foo, i32 0, i32 0, i32 0, i32 0) ["gc-live" (ptr addrspace(1) %obj)] + %obj.relocated = call ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token %safepoint, i32 0, i32 0) + ret ptr addrspace(1) %obj.relocated } Ideally, this sequence would have been represented as a M argument, N @@ -269,10 +270,13 @@ collector: .. code-block:: llvm - define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj) - gc "statepoint-example" { - call token (i64, i32, void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %obj) - ret i8 addrspace(1)* %obj + define void @manual_frame(ptr %a, ptr %b) gc "statepoint-example" { + %alloca = alloca ptr + %allocb = alloca ptr + store ptr %a, ptr %alloca + store ptr %b, ptr %allocb + call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(void ()) @func, i32 0, i32 0, i32 0, i32 0) ["gc-live" (ptr %alloca, ptr %allocb)] + ret void } Recording On Stack Regions @@ -332,25 +336,6 @@ lowering both the base and derived pointer operands are required to be live over the associated call safepoint even if the base is otherwise unused afterwards. -If we extend our previous example to include a pointless derived pointer, -we get: - -.. code-block:: llvm - - define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj) - gc "statepoint-example" { - %gep = getelementptr i8, i8 addrspace(1)* %obj, i64 20000 - %token = call token (i64, i32, void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %obj, i8 addrspace(1)* %gep) - %obj.relocated = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %token, i32 7, i32 7) - %gep.relocated = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %token, i32 7, i32 8) - %p = getelementptr i8, i8 addrspace(1)* %gep, i64 -20000 - ret i8 addrspace(1)* %p - } - -Note that in this example %p and %obj.relocate are the same address and we -could replace one with the other, potentially removing the derived pointer -from the live set at the safepoint entirely. - .. _gc_transition_args: GC Transitions @@ -564,21 +549,20 @@ As an example, given this code: .. 
code-block:: llvm - define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj) + define ptr addrspace(1) @test1(ptr addrspace(1) %obj) gc "statepoint-example" { call void @foo() - ret i8 addrspace(1)* %obj + ret ptr addrspace(1) %obj } The pass would produce this IR: .. code-block:: llvm - define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj) - gc "statepoint-example" { - %0 = call token (i64, i32, void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i8 addrspace(1)* %obj) - %obj.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %0, i32 12, i32 12) - ret i8 addrspace(1)* %obj.relocated + define ptr addrspace(1) @test_rs4gc(ptr addrspace(1) %obj) gc "statepoint-example" { + %statepoint_token = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @foo, i32 0, i32 0, i32 0, i32 0) [ "gc-live"(ptr addrspace(1) %obj) ] + %obj.relocated = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token %statepoint_token, i32 0, i32 0) ; (%obj, %obj) + ret ptr addrspace(1) %obj.relocated } In the above examples, the addrspace(1) marker on the pointers is the mechanism diff --git a/llvm/docs/WritingAnLLVMPass.rst b/llvm/docs/WritingAnLLVMPass.rst index d2afa3913ee07..31194e8b0389c 100644 --- a/llvm/docs/WritingAnLLVMPass.rst +++ b/llvm/docs/WritingAnLLVMPass.rst @@ -489,7 +489,7 @@ spanning the range from ``DominatorSet`` to ``BreakCriticalEdges``. Requiring edges in the CFG when your pass has been run. Some analyses chain to other analyses to do their job. For example, an -`AliasAnalysis ` implementation is required to :ref:`chain +`AliasAnalysis `_ implementation is required to :ref:`chain ` to other alias analysis passes. In cases where analyses chain, the ``addRequiredTransitive`` method should be used instead of the ``addRequired`` method. 
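As a rough illustration of the chaining rule described above (an illustrative sketch, not part of this change; the pass and its name are hypothetical), a legacy-PM pass that keeps querying alias analysis after its run method returns would declare the dependency with ``addRequiredTransitive``:

.. code-block:: c++

   #include "llvm/Analysis/AliasAnalysis.h"
   #include "llvm/IR/Function.h"
   #include "llvm/Pass.h"
   using namespace llvm;

   namespace {
   // Hypothetical pass that stores AA results and queries them lazily, after
   // its runOnFunction has returned.
   struct AAChainingExample : FunctionPass {
     static char ID;
     AAChainingExample() : FunctionPass(ID) {}

     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesAll();
       // addRequiredTransitive (not addRequired) keeps the alias analysis
       // passes alive for as long as this pass's results may be queried.
       AU.addRequiredTransitive<AAResultsWrapperPass>();
     }

     bool runOnFunction(Function &F) override {
       AAResults &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
       (void)AA; // Alias queries would go through AA here.
       return false;
     }
   };
   char AAChainingExample::ID = 0;
   } // namespace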
This informs the ``PassManager`` that the diff --git a/llvm/include/llvm/ADT/STLFunctionalExtras.h b/llvm/include/llvm/ADT/STLFunctionalExtras.h index dd7fc6dc74864..6f172504b3c16 100644 --- a/llvm/include/llvm/ADT/STLFunctionalExtras.h +++ b/llvm/include/llvm/ADT/STLFunctionalExtras.h @@ -69,6 +69,10 @@ class function_ref { } explicit operator bool() const { return callback; } + + bool operator==(const function_ref &Other) const { + return callable == Other.callable; + } }; } // end namespace llvm diff --git a/llvm/include/llvm/Analysis/DXILResource.h b/llvm/include/llvm/Analysis/DXILResource.h index d9b4e968fe3e9..6b577c02f0545 100644 --- a/llvm/include/llvm/Analysis/DXILResource.h +++ b/llvm/include/llvm/Analysis/DXILResource.h @@ -164,6 +164,10 @@ class ResourceInfo { UAVFlags.HasCounter = HasCounter; UAVFlags.IsROV = IsROV; } + const UAVInfo &getUAV() const { + assert(isUAV() && "Not a UAV"); + return UAVFlags; + } void setCBuffer(uint32_t Size) { assert(isCBuffer() && "Not a CBuffer"); CBufferSize = Size; @@ -179,6 +183,10 @@ class ResourceInfo { Typed.ElementTy = ElementTy; Typed.ElementCount = ElementCount; } + const TypedInfo &getTyped() const { + assert(isTyped() && "Not typed"); + return Typed; + } void setFeedback(dxil::SamplerFeedbackType Type) { assert(isFeedback() && "Not Feedback"); Feedback.Type = Type; @@ -187,8 +195,14 @@ class ResourceInfo { assert(isMultiSample() && "Not MultiSampled"); MultiSample.Count = Count; } + const MSInfo &getMultiSample() const { + assert(isMultiSample() && "Not MultiSampled"); + return MultiSample; + } + StringRef getName() const { return Name; } dxil::ResourceClass getResourceClass() const { return RC; } + dxil::ResourceKind getResourceKind() const { return Kind; } bool operator==(const ResourceInfo &RHS) const; bool operator!=(const ResourceInfo &RHS) const { return !(*this == RHS); } @@ -280,6 +294,46 @@ class DXILResourceMap { : (Resources.begin() + Pos->second); } + iterator srv_begin() { return begin(); } + const_iterator srv_begin() const { return begin(); } + iterator srv_end() { return begin() + FirstUAV; } + const_iterator srv_end() const { return begin() + FirstUAV; } + iterator_range srvs() { return make_range(srv_begin(), srv_end()); } + iterator_range srvs() const { + return make_range(srv_begin(), srv_end()); + } + + iterator uav_begin() { return begin() + FirstUAV; } + const_iterator uav_begin() const { return begin() + FirstUAV; } + iterator uav_end() { return begin() + FirstCBuffer; } + const_iterator uav_end() const { return begin() + FirstCBuffer; } + iterator_range uavs() { return make_range(uav_begin(), uav_end()); } + iterator_range uavs() const { + return make_range(uav_begin(), uav_end()); + } + + iterator cbuffer_begin() { return begin() + FirstCBuffer; } + const_iterator cbuffer_begin() const { return begin() + FirstCBuffer; } + iterator cbuffer_end() { return begin() + FirstSampler; } + const_iterator cbuffer_end() const { return begin() + FirstSampler; } + iterator_range cbuffers() { + return make_range(cbuffer_begin(), cbuffer_end()); + } + iterator_range cbuffers() const { + return make_range(cbuffer_begin(), cbuffer_end()); + } + + iterator sampler_begin() { return begin() + FirstSampler; } + const_iterator sampler_begin() const { return begin() + FirstSampler; } + iterator sampler_end() { return end(); } + const_iterator sampler_end() const { return end(); } + iterator_range samplers() { + return make_range(sampler_begin(), sampler_end()); + } + iterator_range samplers() const { + return 
make_range(sampler_begin(), sampler_end()); + } + void print(raw_ostream &OS) const; }; diff --git a/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h b/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h index af72f6e0f90b1..ee447d3e4ebb6 100644 --- a/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h +++ b/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h @@ -15,7 +15,6 @@ #define LLVM_ANALYSIS_FUNCTIONPROPERTIESANALYSIS_H #include "llvm/ADT/DenseSet.h" -#include "llvm/IR/Dominators.h" #include "llvm/IR/PassManager.h" namespace llvm { @@ -187,12 +186,7 @@ class FunctionPropertiesUpdater { static bool isUpdateValid(Function &F, const FunctionPropertiesInfo &FPI, FunctionAnalysisManager &FAM); - DominatorTree &getUpdatedDominatorTree(FunctionAnalysisManager &FAM) const; - DenseSet Successors; - - // Edges we might potentially need to remove from the dominator tree. - SmallVector DomTreeUpdates; }; } // namespace llvm #endif // LLVM_ANALYSIS_FUNCTIONPROPERTIESANALYSIS_H diff --git a/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h b/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h index eada6a647763b..12b906ec9dd58 100644 --- a/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h +++ b/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h @@ -44,7 +44,7 @@ class UnrolledInstAnalyzer : private InstVisitor { friend class InstVisitor; struct SimplifiedAddress { Value *Base = nullptr; - ConstantInt *Offset = nullptr; + APInt Offset; }; public: diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index 5154e2f6659c1..fe46a504bce5d 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -887,8 +887,8 @@ class ScalarEvolution { /// SCEV predicates to Predicates that are required to be true in order for /// the answer to be correct. Predicates can be checked with run-time /// checks and can be used to perform loop versioning. - const SCEV *getPredicatedBackedgeTakenCount(const Loop *L, - SmallVector &Predicates); + const SCEV *getPredicatedBackedgeTakenCount( + const Loop *L, SmallVectorImpl &Predicates); /// When successful, this returns a SCEVConstant that is greater than or equal /// to (i.e. a "conservative over-approximation") of the value returend by @@ -911,7 +911,7 @@ class ScalarEvolution { /// the answer to be correct. Predicates can be checked with run-time /// checks and can be used to perform loop versioning. const SCEV *getPredicatedSymbolicMaxBackedgeTakenCount( - const Loop *L, SmallVector &Predicates); + const Loop *L, SmallVectorImpl &Predicates); /// Return true if the backedge taken count is either the value returned by /// getConstantMaxBackedgeTakenCount or zero. @@ -1556,8 +1556,9 @@ class ScalarEvolution { /// If we allowed SCEV predicates to be generated when populating this /// vector, this information can contain them and therefore a /// SCEVPredicate argument should be added to getExact. - const SCEV *getExact(const Loop *L, ScalarEvolution *SE, - SmallVector *Predicates = nullptr) const; + const SCEV *getExact( + const Loop *L, ScalarEvolution *SE, + SmallVectorImpl *Predicates = nullptr) const; /// Return the number of times this loop exit may fall through to the back /// edge, or SCEVCouldNotCompute. The loop is guaranteed not to exit via @@ -1574,9 +1575,9 @@ class ScalarEvolution { ScalarEvolution *SE) const; /// Get the symbolic max backedge taken count for the loop. 
- const SCEV * - getSymbolicMax(const Loop *L, ScalarEvolution *SE, - SmallVector *Predicates = nullptr); + const SCEV *getSymbolicMax( + const Loop *L, ScalarEvolution *SE, + SmallVectorImpl *Predicates = nullptr); /// Get the symbolic max backedge taken count for the particular loop exit. const SCEV *getSymbolicMax(const BasicBlock *ExitingBlock, diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index 4beac37a58344..8a2e6583af87c 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -759,6 +759,7 @@ enum AttributeKindCodes { ATTR_KIND_INITIALIZES = 94, ATTR_KIND_HYBRID_PATCHABLE = 95, ATTR_KIND_SANITIZE_REALTIME = 96, + ATTR_KIND_NO_SANITIZE_REALTIME = 97, }; enum ComdatSelectionKindCodes { diff --git a/llvm/include/llvm/CodeGen/CallingConvLower.h b/llvm/include/llvm/CodeGen/CallingConvLower.h index 932a2a94ab1f1..d5a63c8dd627a 100644 --- a/llvm/include/llvm/CodeGen/CallingConvLower.h +++ b/llvm/include/llvm/CodeGen/CallingConvLower.h @@ -81,16 +81,16 @@ class CCValAssign { } public: - static CCValAssign getReg(unsigned ValNo, MVT ValVT, unsigned RegNo, + static CCValAssign getReg(unsigned ValNo, MVT ValVT, MCRegister Reg, MVT LocVT, LocInfo HTP, bool IsCustom = false) { CCValAssign Ret(HTP, ValNo, ValVT, LocVT, IsCustom); - Ret.Data = Register(RegNo); + Ret.Data = Register(Reg); return Ret; } - static CCValAssign getCustomReg(unsigned ValNo, MVT ValVT, unsigned RegNo, + static CCValAssign getCustomReg(unsigned ValNo, MVT ValVT, MCRegister Reg, MVT LocVT, LocInfo HTP) { - return getReg(ValNo, ValVT, RegNo, LocVT, HTP, /*IsCustom=*/true); + return getReg(ValNo, ValVT, Reg, LocVT, HTP, /*IsCustom=*/true); } static CCValAssign getMem(unsigned ValNo, MVT ValVT, int64_t Offset, @@ -112,7 +112,7 @@ class CCValAssign { return Ret; } - void convertToReg(unsigned RegNo) { Data = Register(RegNo); } + void convertToReg(MCRegister Reg) { Data = Register(Reg); } void convertToMem(int64_t Offset) { Data = Offset; } @@ -346,7 +346,7 @@ class CCState { /// AllocateReg - Attempt to allocate one of the specified registers. If none /// are available, return zero. Otherwise, return the first one available, /// marking it and any aliases as allocated. - MCPhysReg AllocateReg(ArrayRef Regs) { + MCRegister AllocateReg(ArrayRef Regs) { unsigned FirstUnalloc = getFirstUnallocated(Regs); if (FirstUnalloc == Regs.size()) return MCRegister(); // Didn't find the reg. diff --git a/llvm/include/llvm/CodeGen/LiveVariables.h b/llvm/include/llvm/CodeGen/LiveVariables.h index b73850bb757ec..89d1b5edf3fa6 100644 --- a/llvm/include/llvm/CodeGen/LiveVariables.h +++ b/llvm/include/llvm/CodeGen/LiveVariables.h @@ -253,8 +253,8 @@ class LiveVariables { return false; bool Removed = false; - for (MachineOperand &MO : MI.operands()) { - if (MO.isReg() && MO.isDef() && MO.getReg() == Reg) { + for (MachineOperand &MO : MI.all_defs()) { + if (MO.getReg() == Reg) { MO.setIsDead(false); Removed = true; break; diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 882cadea22369..b8a3c2ac2ac83 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -1018,10 +1018,16 @@ class TargetInstrInfo : public MCInstrInfo { /// The source and destination registers may overlap, which may require a /// careful implementation when multiple copy instructions are required for /// large registers. See for example the ARM target. 
+ /// + /// If RenamableDest is true, the copy instruction's destination operand is + /// marked renamable. + /// If RenamableSrc is true, the copy instruction's source operand is + /// marked renamable. virtual void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, - MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const { + MCRegister DestReg, MCRegister SrcReg, bool KillSrc, + bool RenamableDest = false, + bool RenamableSrc = false) const { llvm_unreachable("Target didn't implement TargetInstrInfo::copyPhysReg!"); } diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index 891e34fec0c79..80936c0ee8335 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -212,6 +212,9 @@ def NoSanitizeBounds : EnumAttr<"nosanitize_bounds", [FnAttr]>; /// No SanitizeCoverage instrumentation. def NoSanitizeCoverage : EnumAttr<"nosanitize_coverage", [FnAttr]>; +/// No SanitizeRealtime instrumentation. +def NoSanitizeRealtime : EnumAttr<"nosanitize_realtime", [FnAttr]>; + /// Null pointer in address space zero is valid. def NullPointerIsValid : EnumAttr<"null_pointer_is_valid", [FnAttr]>; diff --git a/llvm/include/llvm/IR/Statepoint.h b/llvm/include/llvm/IR/Statepoint.h index 21c4a3eaa5ac2..8c508492f704a 100644 --- a/llvm/include/llvm/IR/Statepoint.h +++ b/llvm/include/llvm/IR/Statepoint.h @@ -176,23 +176,23 @@ class GCStatepointInst : public CallBase { } /// Returns an iterator to the begining of the argument range describing gc - /// values for the statepoint. - const_op_iterator gc_args_begin() const { + /// live values for the statepoint. + const_op_iterator gc_live_begin() const { if (auto Opt = getOperandBundle(LLVMContext::OB_gc_live)) return Opt->Inputs.begin(); return arg_end(); } - /// Return an end iterator for the gc argument range - const_op_iterator gc_args_end() const { + /// Return an end iterator for the gc live range + const_op_iterator gc_live_end() const { if (auto Opt = getOperandBundle(LLVMContext::OB_gc_live)) return Opt->Inputs.end(); return arg_end(); } - /// range adapter for gc arguments - iterator_range gc_args() const { - return make_range(gc_args_begin(), gc_args_end()); + /// range adapter for gc live arguments + iterator_range gc_live() const { + return make_range(gc_live_begin(), gc_live_end()); } diff --git a/llvm/include/llvm/MC/MCInst.h b/llvm/include/llvm/MC/MCInst.h index 578b7328970b7..b3d615b4392f5 100644 --- a/llvm/include/llvm/MC/MCInst.h +++ b/llvm/include/llvm/MC/MCInst.h @@ -18,6 +18,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/bit.h" +#include "llvm/MC/MCRegister.h" #include "llvm/Support/SMLoc.h" #include #include @@ -66,15 +67,15 @@ class MCOperand { bool isInst() const { return Kind == kInst; } /// Returns the register number. - unsigned getReg() const { + MCRegister getReg() const { assert(isReg() && "This is not a register operand!"); return RegVal; } /// Set the register number. 
- void setReg(unsigned Reg) { + void setReg(MCRegister Reg) { assert(isReg() && "This is not a register operand!"); - RegVal = Reg; + RegVal = Reg.id(); } int64_t getImm() const { @@ -131,10 +132,10 @@ class MCOperand { InstVal = Val; } - static MCOperand createReg(unsigned Reg) { + static MCOperand createReg(MCRegister Reg) { MCOperand Op; Op.Kind = kRegister; - Op.RegVal = Reg; + Op.RegVal = Reg.id(); return Op; } diff --git a/llvm/include/llvm/MC/MCInstBuilder.h b/llvm/include/llvm/MC/MCInstBuilder.h index d06ed4c6c840a..de45ffb4b2dc7 100644 --- a/llvm/include/llvm/MC/MCInstBuilder.h +++ b/llvm/include/llvm/MC/MCInstBuilder.h @@ -34,7 +34,7 @@ class MCInstBuilder { } /// Add a new register operand. - MCInstBuilder &addReg(unsigned Reg) { + MCInstBuilder &addReg(MCRegister Reg) { Inst.addOperand(MCOperand::createReg(Reg)); return *this; } diff --git a/llvm/include/llvm/MC/MCPseudoProbe.h b/llvm/include/llvm/MC/MCPseudoProbe.h index 13ad1c38f3b3b..32905c1e9a424 100644 --- a/llvm/include/llvm/MC/MCPseudoProbe.h +++ b/llvm/include/llvm/MC/MCPseudoProbe.h @@ -54,20 +54,21 @@ #ifndef LLVM_MC_MCPSEUDOPROBE_H #define LLVM_MC_MCPSEUDOPROBE_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/iterator.h" #include "llvm/IR/PseudoProbe.h" +#include "llvm/Support/Allocator.h" #include "llvm/Support/ErrorOr.h" -#include -#include +#include #include #include #include #include #include -#include #include namespace llvm { @@ -86,7 +87,7 @@ enum class MCPseudoProbeFlag { struct MCPseudoProbeFuncDesc { uint64_t FuncGUID = 0; uint64_t FuncHash = 0; - std::string FuncName; + StringRef FuncName; MCPseudoProbeFuncDesc(uint64_t GUID, uint64_t Hash, StringRef Name) : FuncGUID(GUID), FuncHash(Hash), FuncName(Name){}; @@ -100,17 +101,24 @@ class MCDecodedPseudoProbe; using InlineSite = std::tuple; using MCPseudoProbeInlineStack = SmallVector; // GUID to PseudoProbeFuncDesc map -using GUIDProbeFunctionMap = - std::unordered_map; -// Address to pseudo probes map. -using AddressProbesMap = std::map>; +class GUIDProbeFunctionMap : public std::vector { +public: + auto find(uint64_t GUID) const { + auto CompareDesc = [](const MCPseudoProbeFuncDesc &Desc, uint64_t GUID) { + return Desc.FuncGUID < GUID; + }; + auto It = llvm::lower_bound(*this, GUID, CompareDesc); + if (It->FuncGUID != GUID) + return end(); + return It; + } +}; class MCDecodedPseudoProbeInlineTree; class MCPseudoProbeBase { protected: - uint64_t Guid; - uint64_t Index; + uint32_t Index; uint32_t Discriminator; uint8_t Attributes; uint8_t Type; @@ -120,14 +128,12 @@ class MCPseudoProbeBase { const static uint32_t PseudoProbeFirstId = 1; public: - MCPseudoProbeBase(uint64_t G, uint64_t I, uint64_t At, uint8_t T, uint32_t D) - : Guid(G), Index(I), Discriminator(D), Attributes(At), Type(T) {} + MCPseudoProbeBase(uint64_t I, uint64_t At, uint8_t T, uint32_t D) + : Index(I), Discriminator(D), Attributes(At), Type(T) {} bool isEntry() const { return Index == PseudoProbeFirstId; } - uint64_t getGuid() const { return Guid; } - - uint64_t getIndex() const { return Index; } + uint32_t getIndex() const { return Index; } uint32_t getDiscriminator() const { return Discriminator; } @@ -157,18 +163,20 @@ class MCPseudoProbeBase { /// uses an address from a temporary label created at the current address in the /// current section. 
class MCPseudoProbe : public MCPseudoProbeBase { + uint64_t Guid; MCSymbol *Label; public: MCPseudoProbe(MCSymbol *Label, uint64_t Guid, uint64_t Index, uint64_t Type, uint64_t Attributes, uint32_t Discriminator) - : MCPseudoProbeBase(Guid, Index, Attributes, Type, Discriminator), + : MCPseudoProbeBase(Index, Attributes, Type, Discriminator), Guid(Guid), Label(Label) { assert(Type <= 0xFF && "Probe type too big to encode, exceeding 2^8"); assert(Attributes <= 0xFF && "Probe attributes too big to encode, exceeding 2^16"); } + uint64_t getGuid() const { return Guid; }; MCSymbol *getLabel() const { return Label; } void emit(MCObjectStreamer *MCOS, const MCPseudoProbe *LastProbe) const; }; @@ -181,11 +189,11 @@ class MCDecodedPseudoProbe : public MCPseudoProbeBase { MCDecodedPseudoProbeInlineTree *InlineTree; public: - MCDecodedPseudoProbe(uint64_t Ad, uint64_t G, uint32_t I, PseudoProbeType K, - uint8_t At, uint32_t D, - MCDecodedPseudoProbeInlineTree *Tree) - : MCPseudoProbeBase(G, I, At, static_cast(K), D), Address(Ad), + MCDecodedPseudoProbe(uint64_t Ad, uint32_t I, PseudoProbeType K, uint8_t At, + uint32_t D, MCDecodedPseudoProbeInlineTree *Tree) + : MCPseudoProbeBase(I, At, static_cast(K), D), Address(Ad), InlineTree(Tree){}; + uint64_t getGuid() const; uint64_t getAddress() const { return Address; } @@ -211,21 +219,39 @@ class MCDecodedPseudoProbe : public MCPseudoProbeBase { bool ShowName) const; }; -template -class MCPseudoProbeInlineTreeBase { - struct InlineSiteHash { - uint64_t operator()(const InlineSite &Site) const { - return std::get<0>(Site) ^ std::get<1>(Site); - } - }; +// Address to pseudo probes map. +class AddressProbesMap + : public std::vector> { + auto getIt(uint64_t Addr) const { + auto CompareProbe = [](const MCDecodedPseudoProbe &Probe, uint64_t Addr) { + return Probe.getAddress() < Addr; + }; + return llvm::lower_bound(*this, Addr, CompareProbe); + } +public: + // Returns range of probes within [\p From, \p To) address range. + auto find(uint64_t From, uint64_t To) const { + return llvm::make_range(getIt(From), getIt(To)); + } + // Returns range of probes with given \p Address. + auto find(uint64_t Address) const { + auto FromIt = getIt(Address); + if (FromIt == end() || FromIt->get().getAddress() != Address) + return llvm::make_range(end(), end()); + auto ToIt = getIt(Address + 1); + return llvm::make_range(FromIt, ToIt); + } +}; + +template +class MCPseudoProbeInlineTreeBase { protected: // Track children (e.g. inlinees) of current context - using InlinedProbeTreeMap = std::unordered_map< - InlineSite, std::unique_ptr, InlineSiteHash>; InlinedProbeTreeMap Children; // Set of probes that come with the function. 
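A minimal usage sketch for the range-based AddressProbesMap::find interface introduced above (illustrative only; the helper function and its setup are assumptions, not part of this patch):

    #include "llvm/MC/MCPseudoProbe.h"
    #include "llvm/Support/Format.h"
    #include "llvm/Support/raw_ostream.h"
    #include <cstdint>
    using namespace llvm;

    // Hypothetical helper: print every decoded probe whose address falls in
    // the half-open range [FuncAddr, FuncAddr + FuncSize).
    static void printProbesInRange(MCPseudoProbeDecoder &Decoder,
                                   uint64_t FuncAddr, uint64_t FuncSize) {
      AddressProbesMap &Map = Decoder.getAddress2ProbesMap();
      // find(From, To) relies on the map being sorted by address and returns
      // the probes in [From, To) as a range of references.
      for (const MCDecodedPseudoProbe &Probe :
           Map.find(FuncAddr, FuncAddr + FuncSize))
        outs() << format_hex(Probe.getAddress(), 10) << " probe index "
               << Probe.getIndex() << "\n";
    }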
- std::vector Probes; + ProbesType Probes; MCPseudoProbeInlineTreeBase() { static_assert(std::is_base_of::value, @@ -240,12 +266,10 @@ class MCPseudoProbeInlineTreeBase { bool isRoot() const { return Guid == 0; } InlinedProbeTreeMap &getChildren() { return Children; } const InlinedProbeTreeMap &getChildren() const { return Children; } - std::vector &getProbes() { return Probes; } - const std::vector &getProbes() const { return Probes; } - void addProbes(ProbeType Probe) { Probes.push_back(Probe); } + const ProbesType &getProbes() const { return Probes; } // Caller node of the inline site - MCPseudoProbeInlineTreeBase *Parent = - nullptr; + MCPseudoProbeInlineTreeBase *Parent = nullptr; DerivedProbeInlineTreeType *getOrAddNode(const InlineSite &Site) { auto Ret = Children.emplace( Site, std::make_unique(Site)); @@ -259,9 +283,17 @@ class MCPseudoProbeInlineTreeBase { // instance is created as the root of a tree. // A real instance of this class is created for each function, either a // not inlined function that has code in .text section or an inlined function. +struct InlineSiteHash { + uint64_t operator()(const InlineSite &Site) const { + return std::get<0>(Site) ^ std::get<1>(Site); + } +}; class MCPseudoProbeInlineTree - : public MCPseudoProbeInlineTreeBase { + : public MCPseudoProbeInlineTreeBase< + std::vector, MCPseudoProbeInlineTree, + std::unordered_map, + InlineSiteHash>> { public: MCPseudoProbeInlineTree() = default; MCPseudoProbeInlineTree(uint64_t Guid) { this->Guid = Guid; } @@ -277,16 +309,31 @@ class MCPseudoProbeInlineTree // inline tree node for the decoded pseudo probe class MCDecodedPseudoProbeInlineTree - : public MCPseudoProbeInlineTreeBase { -public: - InlineSite ISite; + : public MCPseudoProbeInlineTreeBase< + MCDecodedPseudoProbe *, MCDecodedPseudoProbeInlineTree, + MutableArrayRef> { + uint32_t NumProbes = 0; + uint32_t ProbeId = 0; +public: MCDecodedPseudoProbeInlineTree() = default; - MCDecodedPseudoProbeInlineTree(const InlineSite &Site) : ISite(Site){}; + MCDecodedPseudoProbeInlineTree(const InlineSite &Site, + MCDecodedPseudoProbeInlineTree *Parent) + : ProbeId(std::get<1>(Site)) { + this->Guid = std::get<0>(Site); + this->Parent = Parent; + } // Return false if it's a dummy inline site bool hasInlineSite() const { return !isRoot() && !Parent->isRoot(); } + InlineSite getInlineSite() const { return InlineSite(Guid, ProbeId); } + void setProbes(MutableArrayRef ProbesRef) { + Probes = ProbesRef.data(); + NumProbes = ProbesRef.size(); + } + auto getProbes() const { + return MutableArrayRef(Probes, NumProbes); + } }; /// Instances of this class represent the pseudo probes inserted into a compile @@ -336,9 +383,25 @@ class MCPseudoProbeTable { }; class MCPseudoProbeDecoder { + // Decoded pseudo probes vector. + std::vector PseudoProbeVec; + // Injected pseudo probes, identified by the containing inline tree node. + // Need to keep injected probes separately for two reasons: + // 1) Probes cannot be added to the PseudoProbeVec: appending may cause + // reallocation so that pointers to its elements will become invalid. + // 2) Probes belonging to function record must be contiguous in PseudoProbeVec + // as owning InlineTree references them with an ArrayRef to save space. + std::unordered_map> + InjectedProbeMap; + // Decoded inline records vector. + std::vector InlineTreeVec; + // GUID to PseudoProbeFuncDesc map. GUIDProbeFunctionMap GUID2FuncDescMap; + BumpPtrAllocator FuncNameAllocator; + // Address to probes map. 
AddressProbesMap Address2ProbesMap; @@ -370,16 +433,18 @@ class MCPseudoProbeDecoder { // Decode pseudo_probe_desc section to build GUID to PseudoProbeFuncDesc map. bool buildGUID2FuncDescMap(const uint8_t *Start, std::size_t Size); + // Decode pseudo_probe section to count the number of probes and inlined + // function records for each function record. + template + bool countRecords(bool &Discard, uint32_t &ProbeCount, uint32_t &InlinedCount, + const Uint64Set &GuidFilter); + // Decode pseudo_probe section to build address to probes map for specifed // functions only. bool buildAddress2ProbeMap(const uint8_t *Start, std::size_t Size, const Uint64Set &GuildFilter, const Uint64Map &FuncStartAddrs); - bool buildAddress2ProbeMap(MCDecodedPseudoProbeInlineTree *Cur, - uint64_t &LastAddr, const Uint64Set &GuildFilter, - const Uint64Map &FuncStartAddrs); - // Print pseudo_probe_desc section info void printGUID2FuncDescMap(raw_ostream &OS); @@ -422,6 +487,34 @@ class MCPseudoProbeDecoder { const MCDecodedPseudoProbeInlineTree &getDummyInlineRoot() const { return DummyInlineRoot; } + + void addInjectedProbe(const MCDecodedPseudoProbe &Probe, uint64_t Address) { + const MCDecodedPseudoProbeInlineTree *Parent = Probe.getInlineTreeNode(); + InjectedProbeMap[Parent].emplace_back(Probe).setAddress(Address); + } + + size_t + getNumInjectedProbes(const MCDecodedPseudoProbeInlineTree *Parent) const { + auto It = InjectedProbeMap.find(Parent); + if (It == InjectedProbeMap.end()) + return 0; + return It->second.size(); + } + + auto getInjectedProbes(MCDecodedPseudoProbeInlineTree *Parent) { + auto It = InjectedProbeMap.find(Parent); + assert(It != InjectedProbeMap.end()); + return iterator_range(It->second); + } + +private: + // Recursively parse an inlining tree encoded in pseudo_probe section. Returns + // whether the the top-level node should be skipped. + template + bool buildAddress2ProbeMap(MCDecodedPseudoProbeInlineTree *Cur, + uint64_t &LastAddr, const Uint64Set &GuildFilter, + const Uint64Map &FuncStartAddrs, + const uint32_t CurChildIndex); }; } // end namespace llvm diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index b8a28669cdd07..e4c48e39a4001 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -111,6 +111,8 @@ class ConstantInt; class Context; class Function; class Instruction; +class FreezeInst; +class FenceInst; class SelectInst; class ExtractElementInst; class InsertElementInst; @@ -127,6 +129,7 @@ class CallBase; class CallInst; class InvokeInst; class CallBrInst; +class LandingPadInst; class FuncletPadInst; class CatchPadInst; class CleanupPadInst; @@ -134,6 +137,7 @@ class CatchReturnInst; class CleanupReturnInst; class GetElementPtrInst; class CastInst; +class PossiblyNonNegInst; class PtrToIntInst; class BitCastInst; class AllocaInst; @@ -141,6 +145,7 @@ class CatchSwitchInst; class SwitchInst; class UnaryOperator; class BinaryOperator; +class PossiblyDisjointInst; class AtomicRMWInst; class AtomicCmpXchgInst; @@ -249,6 +254,8 @@ class Value { friend class Context; // For getting `Val`. friend class User; // For getting `Val`. friend class Use; // For getting `Val`. + friend class FreezeInst; // For getting `Val`. + friend class FenceInst; // For getting `Val`. friend class SelectInst; // For getting `Val`. friend class ExtractElementInst; // For getting `Val`. friend class InsertElementInst; // For getting `Val`. 
@@ -261,6 +268,7 @@ class Value { friend class CallInst; // For getting `Val`. friend class InvokeInst; // For getting `Val`. friend class CallBrInst; // For getting `Val`. + friend class LandingPadInst; // For getting `Val`. friend class FuncletPadInst; // For getting `Val`. friend class CatchPadInst; // For getting `Val`. friend class CleanupPadInst; // For getting `Val`. @@ -678,6 +686,8 @@ class Instruction : public sandboxir::User { /// A SandboxIR Instruction may map to multiple LLVM IR Instruction. This /// returns its topmost LLVM IR instruction. llvm::Instruction *getTopmostLLVMInstruction() const; + friend class FreezeInst; // For getTopmostLLVMInstruction(). + friend class FenceInst; // For getTopmostLLVMInstruction(). friend class SelectInst; // For getTopmostLLVMInstruction(). friend class ExtractElementInst; // For getTopmostLLVMInstruction(). friend class InsertElementInst; // For getTopmostLLVMInstruction(). @@ -689,6 +699,7 @@ class Instruction : public sandboxir::User { friend class CallInst; // For getTopmostLLVMInstruction(). friend class InvokeInst; // For getTopmostLLVMInstruction(). friend class CallBrInst; // For getTopmostLLVMInstruction(). + friend class LandingPadInst; // For getTopmostLLVMInstruction(). friend class CatchPadInst; // For getTopmostLLVMInstruction(). friend class CleanupPadInst; // For getTopmostLLVMInstruction(). friend class CatchReturnInst; // For getTopmostLLVMInstruction(). @@ -882,6 +893,33 @@ template class SingleLLVMInstructionImpl : public Instruction { #endif }; +class FenceInst : public SingleLLVMInstructionImpl { + FenceInst(llvm::FenceInst *FI, Context &Ctx) + : SingleLLVMInstructionImpl(ClassID::Fence, Opcode::Fence, FI, Ctx) {} + friend Context; // For constructor; + +public: + static FenceInst *create(AtomicOrdering Ordering, BBIterator WhereIt, + BasicBlock *WhereBB, Context &Ctx, + SyncScope::ID SSID = SyncScope::System); + /// Returns the ordering constraint of this fence instruction. + AtomicOrdering getOrdering() const { + return cast(Val)->getOrdering(); + } + /// Sets the ordering constraint of this fence instruction. May only be + /// Acquire, Release, AcquireRelease, or SequentiallyConsistent. + void setOrdering(AtomicOrdering Ordering); + /// Returns the synchronization scope ID of this fence instruction. + SyncScope::ID getSyncScopeID() const { + return cast(Val)->getSyncScopeID(); + } + /// Sets the synchronization scope ID of this fence instruction. + void setSyncScopeID(SyncScope::ID SSID); + static bool classof(const Value *From) { + return From->getSubclassID() == ClassID::Fence; + } +}; + class SelectInst : public SingleLLVMInstructionImpl { /// Use Context::createSelectInst(). Don't call the /// constructor directly. @@ -1495,13 +1533,26 @@ class UnaryInstruction public: static bool classof(const Instruction *I) { - return isa(I) || isa(I); + return isa(I) || isa(I) || isa(I); } static bool classof(const Value *V) { return isa(V) && classof(cast(V)); } }; +class FreezeInst : public UnaryInstruction { + FreezeInst(llvm::FreezeInst *FI, Context &Ctx) + : UnaryInstruction(ClassID::Freeze, Opcode::Freeze, FI, Ctx) {} + friend Context; // For constructor; + +public: + static FreezeInst *create(Value *V, BBIterator WhereIt, BasicBlock *WhereBB, + Context &Ctx, const Twine &Name = ""); + static bool classof(const Value *From) { + return From->getSubclassID() == ClassID::Freeze; + } +}; + class LoadInst final : public UnaryInstruction { /// Use LoadInst::create() instead of calling the constructor. 
LoadInst(llvm::LoadInst *LI, Context &Ctx) @@ -1799,8 +1850,7 @@ class InvokeInst final : public CallBase { BasicBlock *getUnwindDest() const; void setNormalDest(BasicBlock *BB); void setUnwindDest(BasicBlock *BB); - // TODO: Return a `LandingPadInst` once implemented. - Instruction *getLandingPadInst() const; + LandingPadInst *getLandingPadInst() const; BasicBlock *getSuccessor(unsigned SuccIdx) const; void setSuccessor(unsigned SuccIdx, BasicBlock *NewSucc) { assert(SuccIdx < 2 && "Successor # out of range for invoke!"); @@ -1858,6 +1908,50 @@ class CallBrInst final : public CallBase { } }; +class LandingPadInst : public SingleLLVMInstructionImpl { + LandingPadInst(llvm::LandingPadInst *LP, Context &Ctx) + : SingleLLVMInstructionImpl(ClassID::LandingPad, Opcode::LandingPad, LP, + Ctx) {} + friend class Context; // For constructor. + +public: + static LandingPadInst *create(Type *RetTy, unsigned NumReservedClauses, + BBIterator WhereIt, BasicBlock *WhereBB, + Context &Ctx, const Twine &Name = ""); + /// Return 'true' if this landingpad instruction is a + /// cleanup. I.e., it should be run when unwinding even if its landing pad + /// doesn't catch the exception. + bool isCleanup() const { + return cast(Val)->isCleanup(); + } + /// Indicate that this landingpad instruction is a cleanup. + void setCleanup(bool V); + + // TODO: We are not implementing addClause() because we have no way to revert + // it for now. + + /// Get the value of the clause at index Idx. Use isCatch/isFilter to + /// determine what type of clause this is. + Constant *getClause(unsigned Idx) const; + + /// Return 'true' if the clause and index Idx is a catch clause. + bool isCatch(unsigned Idx) const { + return cast(Val)->isCatch(Idx); + } + /// Return 'true' if the clause and index Idx is a filter clause. + bool isFilter(unsigned Idx) const { + return cast(Val)->isFilter(Idx); + } + /// Get the number of clauses for this landing pad. + unsigned getNumClauses() const { + return cast(Val)->getNumOperands(); + } + // TODO: We are not implementing reserveClauses() because we can't revert it. + static bool classof(const Value *From) { + return From->getSubclassID() == ClassID::LandingPad; + } +}; + class FuncletPadInst : public SingleLLVMInstructionImpl { FuncletPadInst(ClassID SubclassID, Opcode Opc, llvm::Instruction *I, Context &Ctx) @@ -2282,6 +2376,7 @@ class UnaryOperator : public UnaryInstruction { }; class BinaryOperator : public SingleLLVMInstructionImpl { +protected: static Opcode getBinOpOpcode(llvm::Instruction::BinaryOps BinOp) { switch (BinOp) { case llvm::Instruction::Add: @@ -2361,6 +2456,22 @@ class BinaryOperator : public SingleLLVMInstructionImpl { void swapOperands() { swapOperandsInternal(0, 1); } }; +/// An or instruction, which can be marked as "disjoint", indicating that the +/// inputs don't have a 1 in the same bit position. Meaning this instruction +/// can also be treated as an add. +class PossiblyDisjointInst : public BinaryOperator { +public: + void setIsDisjoint(bool B); + bool isDisjoint() const { + return cast(Val)->isDisjoint(); + } + /// For isa/dyn_cast. 
+ static bool classof(const Value *From) { + return isa(From) && + cast(From)->getOpcode() == Opcode::Or; + } +}; + class AtomicRMWInst : public SingleLLVMInstructionImpl { AtomicRMWInst(llvm::AtomicRMWInst *Atomic, Context &Ctx) : SingleLLVMInstructionImpl(ClassID::AtomicRMW, @@ -2652,6 +2763,28 @@ class CastInst : public UnaryInstruction { Type *getDestTy() const { return cast(Val)->getDestTy(); } }; +/// Instruction that can have a nneg flag (zext/uitofp). +class PossiblyNonNegInst : public CastInst { +public: + bool hasNonNeg() const { + return cast(Val)->hasNonNeg(); + } + void setNonNeg(bool B); + /// For isa/dyn_cast. + static bool classof(const Value *From) { + if (auto *I = dyn_cast(From)) { + switch (I->getOpcode()) { + case Opcode::ZExt: + case Opcode::UIToFP: + return true; + default: + return false; + } + } + return false; + } +}; + // Helper class to simplify stamping out CastInst subclasses. template class CastInstImpl : public CastInst { public: @@ -2854,6 +2987,10 @@ class Context { IRBuilder LLVMIRBuilder; auto &getLLVMIRBuilder() { return LLVMIRBuilder; } + FreezeInst *createFreezeInst(llvm::FreezeInst *SI); + friend FreezeInst; // For createFreezeInst() + FenceInst *createFenceInst(llvm::FenceInst *SI); + friend FenceInst; // For createFenceInst() SelectInst *createSelectInst(llvm::SelectInst *SI); friend SelectInst; // For createSelectInst() InsertElementInst *createInsertElementInst(llvm::InsertElementInst *IEI); @@ -2876,6 +3013,8 @@ class Context { friend InvokeInst; // For createInvokeInst() CallBrInst *createCallBrInst(llvm::CallBrInst *I); friend CallBrInst; // For createCallBrInst() + LandingPadInst *createLandingPadInst(llvm::LandingPadInst *I); + friend LandingPadInst; // For createLandingPadInst() CatchPadInst *createCatchPadInst(llvm::CatchPadInst *I); friend CatchPadInst; // For createCatchPadInst() CleanupPadInst *createCleanupPadInst(llvm::CleanupPadInst *I); diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def index 14cb2d72ad3af..8d79523253f23 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def +++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def @@ -37,6 +37,8 @@ DEF_USER(ConstantInt, ConstantInt) DEF_INSTR(Opaque, OP(Opaque), OpaqueInst) DEF_INSTR(ExtractElement, OP(ExtractElement), ExtractElementInst) DEF_INSTR(InsertElement, OP(InsertElement), InsertElementInst) +DEF_INSTR(Freeze, OP(Freeze), FreezeInst) +DEF_INSTR(Fence, OP(Fence), FenceInst) DEF_INSTR(ShuffleVector, OP(ShuffleVector), ShuffleVectorInst) DEF_INSTR(Select, OP(Select), SelectInst) DEF_INSTR(Br, OP(Br), BranchInst) @@ -46,6 +48,7 @@ DEF_INSTR(Ret, OP(Ret), ReturnInst) DEF_INSTR(Call, OP(Call), CallInst) DEF_INSTR(Invoke, OP(Invoke), InvokeInst) DEF_INSTR(CallBr, OP(CallBr), CallBrInst) +DEF_INSTR(LandingPad, OP(LandingPad), LandingPadInst) DEF_INSTR(CatchPad, OP(CatchPad), CatchPadInst) DEF_INSTR(CleanupPad, OP(CleanupPad), CleanupPadInst) DEF_INSTR(CatchRet, OP(CatchRet), CatchReturnInst) diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h index e6e87eee2c6ba..a339946e67cf2 100644 --- a/llvm/include/llvm/TableGen/Record.h +++ b/llvm/include/llvm/TableGen/Record.h @@ -2328,8 +2328,8 @@ class HasReferenceResolver final : public Resolver { Init *resolve(Init *VarName) override; }; -void EmitDetailedRecords(RecordKeeper &RK, raw_ostream &OS); -void EmitJSON(RecordKeeper &RK, raw_ostream &OS); +void EmitDetailedRecords(const RecordKeeper &RK, raw_ostream &OS); +void EmitJSON(const 
RecordKeeper &RK, raw_ostream &OS);
 
 } // end namespace llvm
 
diff --git a/llvm/include/llvm/TableGen/TableGenBackend.h b/llvm/include/llvm/TableGen/TableGenBackend.h
index 9c5a785f45a40..80134179850b0 100644
--- a/llvm/include/llvm/TableGen/TableGenBackend.h
+++ b/llvm/include/llvm/TableGen/TableGenBackend.h
@@ -13,9 +13,8 @@
 #ifndef LLVM_TABLEGEN_TABLEGENBACKEND_H
 #define LLVM_TABLEGEN_TABLEGENBACKEND_H
 
+#include "llvm/ADT/STLFunctionalExtras.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ManagedStatic.h"
 #include "llvm/TableGen/Record.h"
 
 namespace llvm {
@@ -24,22 +23,19 @@ class RecordKeeper;
 class raw_ostream;
 
 namespace TableGen::Emitter {
-using FnT = void (*)(RecordKeeper &Records, raw_ostream &OS);
-
-struct OptCreatorT {
-  static void *call();
-};
-
-extern ManagedStatic<cl::opt<FnT>, OptCreatorT> Action;
+// Supports const and non-const forms of callback functions.
+using FnT = function_ref<void(RecordKeeper &Records, raw_ostream &OS)>;
 
+/// Creating an `Opt` object registers the command line option \p Name with
+/// the TableGen backend and associates the callback \p CB with that option.
+/// If \p ByDefault is true, then that callback is applied by default if no
+/// command line option was specified.
 struct Opt {
-  Opt(StringRef Name, FnT CB, StringRef Desc, bool ByDefault = false) {
-    if (ByDefault)
-      Action->setInitialValue(CB);
-    Action->getParser().addLiteralOption(Name, CB, Desc);
-  }
+  Opt(StringRef Name, FnT CB, StringRef Desc, bool ByDefault = false);
 };
 
+/// Convenience wrapper around `Opt` that registers `EmitterClass::run` as the
+/// callback.
 template <class EmitterC> class OptClass : Opt {
   static void run(RecordKeeper &RK, raw_ostream &OS) { EmitterC(RK).run(OS); }
 
@@ -47,6 +43,10 @@ template <class EmitterC> class OptClass : Opt {
   OptClass(StringRef Name, StringRef Desc) : Opt(Name, run, Desc) {}
 };
 
+/// Apply the callback for any command line option registered above. Returns
+/// false if no callback was applied.
+bool ApplyCallback(RecordKeeper &Records, raw_ostream &OS);
+
 } // namespace TableGen::Emitter
 
 /// emitSourceFileHeader - Output an LLVM style file header to the specified
@@ -54,6 +54,6 @@ template <class EmitterC> class OptClass : Opt {
 void emitSourceFileHeader(StringRef Desc, raw_ostream &OS,
                           const RecordKeeper &Record = RecordKeeper());
 
-} // End llvm namespace
+} // namespace llvm
 
-#endif
+#endif // LLVM_TABLEGEN_TABLEGENBACKEND_H
diff --git a/llvm/include/llvm/Transforms/IPO/FunctionImport.h b/llvm/include/llvm/Transforms/IPO/FunctionImport.h
index 93d831c26938b..b5b969220df85 100644
--- a/llvm/include/llvm/Transforms/IPO/FunctionImport.h
+++ b/llvm/include/llvm/Transforms/IPO/FunctionImport.h
@@ -139,6 +139,10 @@ class FunctionImporter {
     maybeAddDeclaration(FromModule, GUID);
   }
 
+  // Return the list of source modules, sorted in ascending alphabetical
+  // order.
+  SmallVector<StringRef, 0> getSourceModules() const;
+
   const ImportMapTyImpl &getImportMap() const { return ImportMap; }
 
 private:
diff --git a/llvm/include/llvm/Transforms/Utils/CodeExtractor.h b/llvm/include/llvm/Transforms/Utils/CodeExtractor.h
index 68eb00a50fe03..826347e79f719 100644
--- a/llvm/include/llvm/Transforms/Utils/CodeExtractor.h
+++ b/llvm/include/llvm/Transforms/Utils/CodeExtractor.h
@@ -187,7 +187,8 @@ class CodeExtractorAnalysisCache {
   /// sets, before extraction occurs. These modifications won't have any
   /// significant impact on the cost however.
void findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, - const ValueSet &Allocas) const; + const ValueSet &Allocas, + bool CollectGlobalInputs = false) const; /// Check if life time marker nodes can be hoisted/sunk into the outline /// region. diff --git a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp index 07906fa1aa6c6..6d6ec6c7b1cc7 100644 --- a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp +++ b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp @@ -326,14 +326,6 @@ FunctionPropertiesUpdater::FunctionPropertiesUpdater( // with the CB BB ('Entry') between which the inlined callee will be pasted. Successors.insert(succ_begin(&CallSiteBB), succ_end(&CallSiteBB)); - // the outcome of the inlining may be that some edges get lost (DCEd BBs - // because inlining brought some constant, for example). We don't know which - // edges will be removed, so we list all of them as potentially removable. - for (auto *Succ : successors(&CallSiteBB)) - DomTreeUpdates.emplace_back(DominatorTree::UpdateKind::Delete, - const_cast(&CallSiteBB), - const_cast(Succ)); - // Inlining only handles invoke and calls. If this is an invoke, and inlining // it pulls another invoke, the original landing pad may get split, so as to // share its content with other potential users. So the edge up to which we @@ -344,11 +336,6 @@ FunctionPropertiesUpdater::FunctionPropertiesUpdater( if (const auto *II = dyn_cast(&CB)) { const auto *UnwindDest = II->getUnwindDest(); Successors.insert(succ_begin(UnwindDest), succ_end(UnwindDest)); - // Same idea as above, we pretend we lose all these edges. - for (auto *Succ : successors(UnwindDest)) - DomTreeUpdates.emplace_back(DominatorTree::UpdateKind::Delete, - const_cast(UnwindDest), - const_cast(Succ)); } // Exclude the CallSiteBB, if it happens to be its own successor (1-BB loop). @@ -369,30 +356,6 @@ FunctionPropertiesUpdater::FunctionPropertiesUpdater( FPI.updateForBB(*BB, -1); } -DominatorTree &FunctionPropertiesUpdater::getUpdatedDominatorTree( - FunctionAnalysisManager &FAM) const { - auto &DT = - FAM.getResult(const_cast(Caller)); - - SmallVector FinalDomTreeUpdates; - - for (auto &Upd : DomTreeUpdates) - FinalDomTreeUpdates.push_back(Upd); - - DenseSet Inserted; - for (auto *Succ : successors(&CallSiteBB)) - if (Inserted.insert(Succ).second) - FinalDomTreeUpdates.push_back({DominatorTree::UpdateKind::Insert, - const_cast(&CallSiteBB), - const_cast(Succ)}); - - DT.applyUpdates(FinalDomTreeUpdates); -#ifdef EXPENSIVE_CHECKS - assert(DT.verify(DominatorTree::VerificationLevel::Full)); -#endif - return DT; -} - void FunctionPropertiesUpdater::finish(FunctionAnalysisManager &FAM) const { // Update feature values from the BBs that were copied from the callee, or // might have been modified because of inlining. The latter have been @@ -420,7 +383,8 @@ void FunctionPropertiesUpdater::finish(FunctionAnalysisManager &FAM) const { // remove E. 
SetVector Reinclude; SetVector Unreachable; - auto &DT = getUpdatedDominatorTree(FAM); + const auto &DT = + FAM.getResult(const_cast(Caller)); if (&CallSiteBB != &*Caller.begin()) Reinclude.insert(&*Caller.begin()); @@ -463,9 +427,6 @@ void FunctionPropertiesUpdater::finish(FunctionAnalysisManager &FAM) const { const auto &LI = FAM.getResult(const_cast(Caller)); FPI.updateAggregateStats(Caller, LI); -#ifdef EXPENSIVE_CHECKS - assert(isUpdateValid(Caller, FPI, FAM)); -#endif } bool FunctionPropertiesUpdater::isUpdateValid(Function &F, diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 8d53a27fb75eb..980f142f11326 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -1937,6 +1937,27 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize( LLVM_DEBUG(dbgs() << "LAA: Distance for " << *AInst << " to " << *BInst << ": " << *Dist << "\n"); + // Check if we can prove that Sink only accesses memory after Src's end or + // vice versa. At the moment this is limited to cases where either source or + // sink are loop invariant to avoid compile-time increases. This is not + // required for correctness. + if (SE.isLoopInvariant(Src, InnermostLoop) || + SE.isLoopInvariant(Sink, InnermostLoop)) { + const auto &[SrcStart, SrcEnd] = + getStartAndEndForAccess(InnermostLoop, Src, ATy, PSE, PointerBounds); + const auto &[SinkStart, SinkEnd] = + getStartAndEndForAccess(InnermostLoop, Sink, BTy, PSE, PointerBounds); + if (!isa(SrcStart) && + !isa(SrcEnd) && + !isa(SinkStart) && + !isa(SinkEnd)) { + if (SE.isKnownPredicate(CmpInst::ICMP_ULE, SrcEnd, SinkStart)) + return MemoryDepChecker::Dependence::NoDep; + if (SE.isKnownPredicate(CmpInst::ICMP_ULE, SinkEnd, SrcStart)) + return MemoryDepChecker::Dependence::NoDep; + } + } + // Need accesses with constant strides and the same direction for further // dependence analysis. We don't want to vectorize "A[B[i]] += ..." and // similar code or pointer arithmetic that could wrap in the address space. @@ -1982,45 +2003,12 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B, unsigned BIdx) { assert(AIdx < BIdx && "Must pass arguments in program order"); - // Check if we can prove that Sink only accesses memory after Src's end or - // vice versa. The helper is used to perform the checks only on the exit paths - // where it helps to improve the analysis result. - auto CheckCompletelyBeforeOrAfter = [&]() { - auto *APtr = A.getPointer(); - auto *BPtr = B.getPointer(); - - Type *ATy = getLoadStoreType(InstMap[AIdx]); - Type *BTy = getLoadStoreType(InstMap[BIdx]); - - const SCEV *Src = PSE.getSCEV(APtr); - const SCEV *Sink = PSE.getSCEV(BPtr); - - const auto &[SrcStart, SrcEnd] = - getStartAndEndForAccess(InnermostLoop, Src, ATy, PSE, PointerBounds); - if (isa(SrcStart) || isa(SrcEnd)) - return false; - - const auto &[SinkStart, SinkEnd] = - getStartAndEndForAccess(InnermostLoop, Sink, BTy, PSE, PointerBounds); - if (isa(SinkStart) || - isa(SinkEnd)) - return false; - - auto &SE = *PSE.getSE(); - return SE.isKnownPredicate(CmpInst::ICMP_ULE, SrcEnd, SinkStart) || - SE.isKnownPredicate(CmpInst::ICMP_ULE, SinkEnd, SrcStart); - }; - // Get the dependence distance, stride, type size and what access writes for // the dependence between A and B. 
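Editor's note on the LoopAccessAnalysis hunk above: the "source ends before sink starts (or vice versa)" proof now runs inside getDependenceDistanceStrideAndSize and is only attempted when Src or Sink is loop invariant. The standalone sketch below shows just the interval-disjointness argument behind that NoDep early exit; `AccessRange` and `provablyDisjoint` are invented stand-ins for the (Start, End) SCEVs returned by getStartAndEndForAccess, not the SCEV-based implementation.

// Editor's sketch: if one access's last byte is <= the other's first byte,
// the two accesses cannot overlap, so there is no dependence to track.
#include <cstdint>
#include <iostream>

struct AccessRange {
  uint64_t Start; // address of the first byte accessed
  uint64_t End;   // one past the last byte accessed
};

// Returns true when the two byte ranges provably do not overlap.
bool provablyDisjoint(const AccessRange &Src, const AccessRange &Sink) {
  return Src.End <= Sink.Start || Sink.End <= Src.Start;
}

int main() {
  AccessRange Invariant{0x1000, 0x1004}; // e.g. a loop-invariant load
  AccessRange Strided{0x2000, 0x2400};   // e.g. A[i] over the whole loop
  std::cout << (provablyDisjoint(Invariant, Strided) ? "NoDep\n" : "Unknown\n");
}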
auto Res = getDependenceDistanceStrideAndSize(A, InstMap[AIdx], B, InstMap[BIdx]); - if (std::holds_alternative(Res)) { - if (std::get(Res) == Dependence::Unknown && - CheckCompletelyBeforeOrAfter()) - return Dependence::NoDep; + if (std::holds_alternative(Res)) return std::get(Res); - } auto &[Dist, StrideA, StrideB, TypeByteSize, AIsWrite, BIsWrite] = std::get(Res); @@ -2029,9 +2017,6 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, std::optional CommonStride = StrideA == StrideB ? std::make_optional(StrideA) : std::nullopt; if (isa(Dist)) { - if (CheckCompletelyBeforeOrAfter()) - return Dependence::NoDep; - // TODO: Relax requirement that there is a common stride to retry with // non-constant distance dependencies. FoundNonConstantDistanceDependence |= CommonStride.has_value(); @@ -2083,8 +2068,6 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, // Write to the same location with the same size. return Dependence::Forward; } - assert(!CheckCompletelyBeforeOrAfter() && - "unexpectedly proved no dependence"); LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but " "different type sizes\n"); return Dependence::Unknown; @@ -2106,8 +2089,6 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, // did not set it when strides were different but there is no inherent // reason to. FoundNonConstantDistanceDependence |= CommonStride.has_value(); - if (CheckCompletelyBeforeOrAfter()) - return Dependence::NoDep; return Dependence::Unknown; } if (!HasSameSize || @@ -2127,9 +2108,6 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, // Below we only handle strictly positive distances. if (MinDistance <= 0) { FoundNonConstantDistanceDependence |= CommonStride.has_value(); - if (CheckCompletelyBeforeOrAfter()) - return Dependence::NoDep; - return Dependence::Unknown; } @@ -2146,18 +2124,13 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, } if (!HasSameSize) { - if (CheckCompletelyBeforeOrAfter()) - return Dependence::NoDep; LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with " "different type sizes\n"); return Dependence::Unknown; } - if (!CommonStride) { - if (CheckCompletelyBeforeOrAfter()) - return Dependence::NoDep; + if (!CommonStride) return Dependence::Unknown; - } // Bail out early if passed-in parameters make vectorization not feasible. unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ? @@ -2205,10 +2178,6 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, // dependence distance and the distance may be larger at runtime (and safe // for vectorization). Classify it as Unknown, so we re-try with runtime // checks. - // - if (CheckCompletelyBeforeOrAfter()) - return Dependence::NoDep; - return Dependence::Unknown; } LLVM_DEBUG(dbgs() << "LAA: Failure because of positive minimum distance " @@ -2221,8 +2190,6 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, if (MinDistanceNeeded > MinDepDistBytes) { LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least " << MinDistanceNeeded << " size in bytes\n"); - assert(!CheckCompletelyBeforeOrAfter() && - "unexpectedly proved no dependence"); return Dependence::Backward; } @@ -2270,8 +2237,6 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, // For non-constant distances, we checked the lower bound of the dependence // distance and the distance may be larger at runtime (and safe for // vectorization). 
Classify it as Unknown, so we re-try with runtime checks. - assert(!CheckCompletelyBeforeOrAfter() && - "unexpectedly proved no dependence"); return Dependence::Unknown; } diff --git a/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp b/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp index 7b6ca4d711fcd..8824cec86924f 100644 --- a/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp +++ b/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp @@ -58,13 +58,13 @@ bool UnrolledInstAnalyzer::simplifyInstWithSCEV(Instruction *I) { auto *Base = dyn_cast(SE.getPointerBase(S)); if (!Base) return false; - auto *Offset = - dyn_cast(SE.getMinusSCEV(ValueAtIteration, Base)); + std::optional Offset = + SE.computeConstantDifference(ValueAtIteration, Base); if (!Offset) return false; SimplifiedAddress Address; Address.Base = Base->getValue(); - Address.Offset = Offset->getValue(); + Address.Offset = *Offset; SimplifiedAddresses[I] = Address; return false; } @@ -105,7 +105,7 @@ bool UnrolledInstAnalyzer::visitLoad(LoadInst &I) { auto AddressIt = SimplifiedAddresses.find(AddrOp); if (AddressIt == SimplifiedAddresses.end()) return false; - ConstantInt *SimplifiedAddrOp = AddressIt->second.Offset; + const APInt &SimplifiedAddrOp = AddressIt->second.Offset; auto *GV = dyn_cast(AddressIt->second.Base); // We're only interested in loads that can be completely folded to a @@ -125,9 +125,9 @@ bool UnrolledInstAnalyzer::visitLoad(LoadInst &I) { return false; unsigned ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U; - if (SimplifiedAddrOp->getValue().getActiveBits() > 64) + if (SimplifiedAddrOp.getActiveBits() > 64) return false; - int64_t SimplifiedAddrOpV = SimplifiedAddrOp->getSExtValue(); + int64_t SimplifiedAddrOpV = SimplifiedAddrOp.getSExtValue(); if (SimplifiedAddrOpV < 0) { // FIXME: For now we conservatively ignore out of bound accesses, but // we're allowed to perform the optimization in this case. 
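Editor's note on the LoopUnrollAnalyzer hunk above: the simplified address offset is now an APInt obtained from ScalarEvolution::computeConstantDifference rather than a ConstantInt, so visitLoad checks getActiveBits() before calling getSExtValue(). Below is a hedged sketch of just that guard, assuming only llvm/ADT/APInt.h is available; the surrounding analysis and the `foldableOffset` helper name are not part of the patch.

// Editor's sketch: the overflow guard used before folding an offset into a
// 64-bit index, mirroring the checks kept in visitLoad above.
#include "llvm/ADT/APInt.h"
#include <cstdint>
#include <optional>

// Returns the offset as int64_t, or std::nullopt if it cannot be used.
std::optional<int64_t> foldableOffset(const llvm::APInt &Offset) {
  if (Offset.getActiveBits() > 64) // would not fit in a signed 64-bit index
    return std::nullopt;
  int64_t V = Offset.getSExtValue();
  if (V < 0)                       // negative (out-of-bounds) offsets are ignored
    return std::nullopt;
  return V;
}

int main() {
  llvm::APInt Small(64, 16);       // fits, so the offset 16 is usable
  auto A = foldableOffset(Small);
  return (A && *A == 16) ? 0 : 1;
}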
@@ -186,10 +186,9 @@ bool UnrolledInstAnalyzer::visitCmpInst(CmpInst &I) { if (SimplifiedRHS != SimplifiedAddresses.end()) { SimplifiedAddress &LHSAddr = SimplifiedLHS->second; SimplifiedAddress &RHSAddr = SimplifiedRHS->second; - if (LHSAddr.Base == RHSAddr.Base) { - LHS = LHSAddr.Offset; - RHS = RHSAddr.Offset; - } + if (LHSAddr.Base == RHSAddr.Base) + return ICmpInst::compare(LHSAddr.Offset, RHSAddr.Offset, + I.getPredicate()); } } } diff --git a/llvm/lib/Analysis/MLInlineAdvisor.cpp b/llvm/lib/Analysis/MLInlineAdvisor.cpp index 8bb5efcf1b2ec..b59aa4810005b 100644 --- a/llvm/lib/Analysis/MLInlineAdvisor.cpp +++ b/llvm/lib/Analysis/MLInlineAdvisor.cpp @@ -288,6 +288,7 @@ void MLInlineAdvisor::onSuccessfulInlining(const MLInlineAdvice &Advice, { PreservedAnalyses PA = PreservedAnalyses::all(); PA.abandon(); + PA.abandon(); PA.abandon(); FAM.invalidate(*Caller, PA); } diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 21a1c74eefc07..54dde8401cdff 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -8247,9 +8247,8 @@ const SCEV *ScalarEvolution::getExitCount(const Loop *L, llvm_unreachable("Invalid ExitCountKind!"); } -const SCEV * -ScalarEvolution::getPredicatedBackedgeTakenCount(const Loop *L, - SmallVector &Preds) { +const SCEV *ScalarEvolution::getPredicatedBackedgeTakenCount( + const Loop *L, SmallVectorImpl &Preds) { return getPredicatedBackedgeTakenInfo(L).getExact(L, this, &Preds); } @@ -8267,7 +8266,7 @@ const SCEV *ScalarEvolution::getBackedgeTakenCount(const Loop *L, } const SCEV *ScalarEvolution::getPredicatedSymbolicMaxBackedgeTakenCount( - const Loop *L, SmallVector &Preds) { + const Loop *L, SmallVectorImpl &Preds) { return getPredicatedBackedgeTakenInfo(L).getSymbolicMax(L, this, &Preds); } @@ -8537,9 +8536,9 @@ void ScalarEvolution::forgetBlockAndLoopDispositions(Value *V) { /// is never skipped. This is a valid assumption as long as the loop exits via /// that test. For precise results, it is the caller's responsibility to specify /// the relevant loop exiting block using getExact(ExitingBlock, SE). -const SCEV * -ScalarEvolution::BackedgeTakenInfo::getExact(const Loop *L, ScalarEvolution *SE, - SmallVector *Preds) const { +const SCEV *ScalarEvolution::BackedgeTakenInfo::getExact( + const Loop *L, ScalarEvolution *SE, + SmallVectorImpl *Preds) const { // If any exits were not computable, the loop is not computable. if (!isComplete() || ExitNotTaken.empty()) return SE->getCouldNotCompute(); @@ -8622,7 +8621,7 @@ ScalarEvolution::BackedgeTakenInfo::getConstantMax(ScalarEvolution *SE) const { const SCEV *ScalarEvolution::BackedgeTakenInfo::getSymbolicMax( const Loop *L, ScalarEvolution *SE, - SmallVector *Predicates) { + SmallVectorImpl *Predicates) { if (!SymbolicMax) { // Form an expression for the maximum exit count possible for this loop. 
We // merge the max and exact information to approximate a version of diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index d4dbab04e8ecd..6767c7ca288bc 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2093,6 +2093,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::NoSanitizeBounds; case bitc::ATTR_KIND_NO_SANITIZE_COVERAGE: return Attribute::NoSanitizeCoverage; + case bitc::ATTR_KIND_NO_SANITIZE_REALTIME: + return Attribute::NoSanitizeRealtime; case bitc::ATTR_KIND_NULL_POINTER_IS_VALID: return Attribute::NullPointerIsValid; case bitc::ATTR_KIND_OPTIMIZE_FOR_DEBUGGING: diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 2f3e90d6e3821..d21ff10d51d00 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -795,6 +795,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_NO_SANITIZE_BOUNDS; case Attribute::NoSanitizeCoverage: return bitc::ATTR_KIND_NO_SANITIZE_COVERAGE; + case llvm::Attribute::NoSanitizeRealtime: + return bitc::ATTR_KIND_NO_SANITIZE_REALTIME; case Attribute::NullPointerIsValid: return bitc::ATTR_KIND_NULL_POINTER_IS_VALID; case Attribute::OptimizeForDebugging: diff --git a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp index bccd9b04cd2c5..e40248197c7c7 100644 --- a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -402,8 +402,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction( // Scan the register defs for this instruction and update // live-ranges. - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isReg() || !MO.isDef()) continue; + for (const MachineOperand &MO : MI.all_defs()) { Register Reg = MO.getReg(); if (Reg == 0) continue; // Ignore KILLs and passthru registers for liveness... 
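Editor's note: the AggressiveAntiDepBreaker hunk above, and several hunks that follow (MachineInstr.cpp, ModuloSchedule.cpp, RegAllocFast.cpp, RegisterCoalescer.cpp), replace hand-rolled `isReg() && isDef()` checks with iteration over MachineInstr::all_defs(). The standalone sketch below shows the same "pre-filtered view" idea using llvm::make_filter_range; `Operand` and `allDefs` are made-up stand-ins, and the real all_defs() is implemented differently.

// Editor's sketch: an all_defs()-style filtered view over operands, so loop
// bodies no longer need the manual isReg()/isDef() boilerplate.
#include "llvm/ADT/STLExtras.h"
#include <iostream>
#include <vector>

struct Operand {
  bool IsReg = false;
  bool IsDef = false;
  unsigned Reg = 0;
};

// Yields only register-def operands from the full operand list.
auto allDefs(std::vector<Operand> &Ops) {
  return llvm::make_filter_range(
      Ops, [](const Operand &MO) { return MO.IsReg && MO.IsDef; });
}

int main() {
  std::vector<Operand> Ops = {{true, true, 1}, {true, false, 2}, {false, false, 0}};
  for (Operand &MO : allDefs(Ops)) // mirrors: for (MachineOperand &MO : MI.all_defs())
    std::cout << "def of reg " << MO.Reg << "\n";
}

The filtered range also composes with other range adaptors, which is why a loop like `reverse(MI.all_defs())` in the RegAllocFast hunk below stays a one-liner.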
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index 0f2acdb12389d..f21910ee3a444 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -2125,19 +2125,15 @@ bool MachineInstr::addRegisterDead(Register Reg, } void MachineInstr::clearRegisterDeads(Register Reg) { - for (MachineOperand &MO : operands()) { - if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg) - continue; - MO.setIsDead(false); - } + for (MachineOperand &MO : all_defs()) + if (MO.getReg() == Reg) + MO.setIsDead(false); } void MachineInstr::setRegisterDefReadUndef(Register Reg, bool IsUndef) { - for (MachineOperand &MO : operands()) { - if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg || MO.getSubReg() == 0) - continue; - MO.setIsUndef(IsUndef); - } + for (MachineOperand &MO : all_defs()) + if (MO.getReg() == Reg && MO.getSubReg() != 0) + MO.setIsUndef(IsUndef); } void MachineInstr::addRegisterDefined(Register Reg, @@ -2147,9 +2143,8 @@ void MachineInstr::addRegisterDefined(Register Reg, if (MO) return; } else { - for (const MachineOperand &MO : operands()) { - if (MO.isReg() && MO.getReg() == Reg && MO.isDef() && - MO.getSubReg() == 0) + for (const MachineOperand &MO : all_defs()) { + if (MO.getReg() == Reg && MO.getSubReg() == 0) return; } } diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp index 78201d9bfb79a..99c82bc3a2660 100644 --- a/llvm/lib/CodeGen/ModuloSchedule.cpp +++ b/llvm/lib/CodeGen/ModuloSchedule.cpp @@ -2667,8 +2667,8 @@ void ModuloScheduleExpanderMVE::calcNumUnroll() { void ModuloScheduleExpanderMVE::updateInstrDef(MachineInstr *NewMI, ValueMapTy &VRMap, bool LastDef) { - for (MachineOperand &MO : NewMI->operands()) { - if (!MO.isReg() || !MO.getReg().isVirtual() || !MO.isDef()) + for (MachineOperand &MO : NewMI->all_defs()) { + if (!MO.getReg().isVirtual()) continue; Register Reg = MO.getReg(); const TargetRegisterClass *RC = MRI.getRegClass(Reg); diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp index 62f7ed29c8c81..6babd5a3f1f96 100644 --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -1329,9 +1329,8 @@ void RegAllocFastImpl::findAndSortDefOperandIndexes(const MachineInstr &MI) { // we assign these. SmallVector RegClassDefCounts(TRI->getNumRegClasses(), 0); - for (const MachineOperand &MO : MI.operands()) - if (MO.isReg() && MO.isDef()) - addRegClassDefCounts(RegClassDefCounts, MO.getReg()); + for (const MachineOperand &MO : MI.all_defs()) + addRegClassDefCounts(RegClassDefCounts, MO.getReg()); llvm::sort(DefOperandIndexes, [&](unsigned I0, unsigned I1) { const MachineOperand &MO0 = MI.getOperand(I0); @@ -1481,9 +1480,7 @@ void RegAllocFastImpl::allocateInstruction(MachineInstr &MI) { // Assign virtual register defs. while (ReArrangedImplicitOps) { ReArrangedImplicitOps = false; - for (MachineOperand &MO : MI.operands()) { - if (!MO.isReg() || !MO.isDef()) - continue; + for (MachineOperand &MO : MI.all_defs()) { Register Reg = MO.getReg(); if (Reg.isVirtual()) { ReArrangedImplicitOps = @@ -1499,10 +1496,7 @@ void RegAllocFastImpl::allocateInstruction(MachineInstr &MI) { // Free registers occupied by defs. // Iterate operands in reverse order, so we see the implicit super register // defs first (we added them earlier in case of ). 
- for (MachineOperand &MO : reverse(MI.operands())) { - if (!MO.isReg() || !MO.isDef()) - continue; - + for (MachineOperand &MO : reverse(MI.all_defs())) { Register Reg = MO.getReg(); // subreg defs don't free the full register. We left the subreg number diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index f6c53f3051c2f..97f8346df0e8f 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -3230,8 +3230,8 @@ void JoinVals::pruneValues(JoinVals &Other, // Also remove dead flags since the joined live range will // continue past this instruction. for (MachineOperand &MO : - Indexes->getInstructionFromIndex(Def)->operands()) { - if (MO.isReg() && MO.isDef() && MO.getReg() == Reg) { + Indexes->getInstructionFromIndex(Def)->all_defs()) { + if (MO.getReg() == Reg) { if (MO.getSubReg() != 0 && MO.isUndef() && !EraseImpDef) MO.setIsUndef(false); MO.setIsDead(false); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 1a98fbd7589fb..b13a2df7b48eb 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -439,8 +439,8 @@ class SelectionDAGBuilder { /// The set of gc.relocate calls associated with this gc.statepoint. SmallVector GCRelocates; - /// The full list of gc arguments to the gc.statepoint being lowered. - ArrayRef GCArgs; + /// The full list of gc-live arguments to the gc.statepoint being lowered. + ArrayRef GCLives; /// The gc.statepoint instruction. const Instruction *StatepointInstr = nullptr; diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp index 4268da8670d50..a1f87d2c62573 100644 --- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp @@ -673,7 +673,7 @@ lowerStatepointMetaArgs(SmallVectorImpl &Ops, // it is the contents of the slot which may get updated, not the pointer to // the alloca SmallVector Allocas; - for (Value *V : SI.GCArgs) { + for (Value *V : SI.GCLives) { SDValue Incoming = Builder.getValue(V); if (FrameIndexSDNode *FI = dyn_cast(Incoming)) { // This handles allocas as arguments to the statepoint @@ -1086,7 +1086,7 @@ SelectionDAGBuilder::LowerStatepoint(const GCStatepointInst &I, } } - SI.GCArgs = ArrayRef(I.gc_args_begin(), I.gc_args_end()); + SI.GCLives = ArrayRef(I.gc_live_begin(), I.gc_live_end()); SI.StatepointInstr = &I; SI.ID = I.getID(); diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index c2b8c39662bb6..38bd0b0ba4114 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -823,7 +823,9 @@ void TargetInstrInfo::lowerCopy(MachineInstr *MI, } copyPhysReg(*MI->getParent(), MI, MI->getDebugLoc(), DstMO.getReg(), - SrcMO.getReg(), SrcMO.isKill()); + SrcMO.getReg(), SrcMO.isKill(), + DstMO.getReg().isPhysical() ? DstMO.isRenamable() : false, + SrcMO.getReg().isPhysical() ? 
SrcMO.isRenamable() : false); if (MI->getNumOperands() > 2) transferImplicitOperands(MI, TRI); diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 70a6e74b94d55..532313a31fc13 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -1548,7 +1548,16 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel( BasicBlock *CommonExit = nullptr; SetVector Inputs, Outputs, SinkingCands, HoistingCands; Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit); - Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands); + + Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands, + /*CollectGlobalInputs=*/true); + + Inputs.remove_if([&](Value *I) { + if (auto *GV = dyn_cast_if_present(I)) + return GV->getValueType() == OpenMPIRBuilder::Ident; + + return false; + }); LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n"); diff --git a/llvm/lib/IR/AttributeImpl.h b/llvm/lib/IR/AttributeImpl.h index b9441729b48c6..2f1c7b85e6650 100644 --- a/llvm/lib/IR/AttributeImpl.h +++ b/llvm/lib/IR/AttributeImpl.h @@ -275,7 +275,7 @@ class ConstantRangeListAttributeImpl final class AttributeBitSet { /// Bitset with a bit for each available attribute Attribute::AttrKind. - uint8_t AvailableAttrs[12] = {}; + uint8_t AvailableAttrs[16] = {}; static_assert(Attribute::EndAttrKinds <= sizeof(AvailableAttrs) * CHAR_BIT, "Too many attributes"); diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index ac754b5d9638d..5ff1f3dfb0dc9 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -2223,6 +2223,12 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, "Attributes 'optdebug and optnone' are incompatible!", V); } + Check(!(Attrs.hasFnAttr(Attribute::SanitizeRealtime) && + Attrs.hasFnAttr(Attribute::NoSanitizeRealtime)), + "Attributes " + "'sanitize_realtime and nosanitize_realtime' are incompatible!", + V); + if (Attrs.hasFnAttr(Attribute::OptimizeForDebugging)) { Check(!Attrs.hasFnAttr(Attribute::OptimizeForSize), "Attributes 'optsize and optdebug' are incompatible!", V); diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index af09f2a0ee0cd..66e52fe2d08f8 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -5784,7 +5784,7 @@ bool AsmParser::parseDirectiveIrpc(SMLoc DirectiveLoc) { : A[0][0].getString(); for (std::size_t I = 0, End = Values.size(); I != End; ++I) { MCAsmMacroArgument Arg; - Arg.emplace_back(AsmToken::Identifier, Values.slice(I, I + 1)); + Arg.emplace_back(AsmToken::Identifier, Values.substr(I, 1)); // Note that the AtPseudoVariable is enabled for instantiations of .irpc. // This is undocumented, but GAS seems to support it. 
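Editor's note on the AsmParser.cpp hunk above (and the MasmParser.cpp hunk that follows): `Values.slice(I, I + 1)` becomes `Values.substr(I, 1)`. Both produce the same one-character StringRef; slice takes a half-open (start, end) pair while substr takes (start, length). A small sketch of the equivalence, assuming LLVM's ADT headers are available:

// Editor's sketch: StringRef::slice(Start, End) vs. StringRef::substr(Start, N).
// Both calls below yield the same single-character StringRef.
#include "llvm/ADT/StringRef.h"
#include <cassert>

int main() {
  llvm::StringRef Values = "abc";
  for (size_t I = 0, End = Values.size(); I != End; ++I) {
    llvm::StringRef ByEnd = Values.slice(I, I + 1); // half-open [I, I+1)
    llvm::StringRef ByLen = Values.substr(I, 1);    // start I, length 1
    assert(ByEnd == ByLen);
  }
  return 0;
}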
diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp index f64b7f62d61d0..9f619c5018b50 100644 --- a/llvm/lib/MC/MCParser/MasmParser.cpp +++ b/llvm/lib/MC/MCParser/MasmParser.cpp @@ -7125,7 +7125,7 @@ bool MasmParser::parseDirectiveForc(SMLoc DirectiveLoc, StringRef Directive) { StringRef Values(Argument); for (std::size_t I = 0, End = Values.size(); I != End; ++I) { MCAsmMacroArgument Arg; - Arg.emplace_back(AsmToken::Identifier, Values.slice(I, I + 1)); + Arg.emplace_back(AsmToken::Identifier, Values.substr(I, 1)); if (expandMacro(OS, M->Body, Parameter, Arg, M->Locals, getTok().getLoc())) return true; diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp index 3f6f605149b47..90d7588407068 100644 --- a/llvm/lib/MC/MCPseudoProbe.cpp +++ b/llvm/lib/MC/MCPseudoProbe.cpp @@ -18,6 +18,7 @@ #include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" #include "llvm/Support/LEB128.h" #include "llvm/Support/MD5.h" #include "llvm/Support/raw_ostream.h" @@ -48,6 +49,8 @@ static const MCExpr *buildSymbolDiff(MCObjectStreamer *MCOS, const MCSymbol *A, return AddrDelta; } +uint64_t MCDecodedPseudoProbe::getGuid() const { return InlineTree->Guid; } + void MCPseudoProbe::emit(MCObjectStreamer *MCOS, const MCPseudoProbe *LastProbe) const { bool IsSentinel = isSentinelProbe(getAttributes()); @@ -271,7 +274,7 @@ static StringRef getProbeFNameForGUID(const GUIDProbeFunctionMap &GUID2FuncMAP, auto It = GUID2FuncMAP.find(GUID); assert(It != GUID2FuncMAP.end() && "Probe function must exist for a valid GUID"); - return It->second.FuncName; + return It->FuncName; } void MCPseudoProbeFuncDesc::print(raw_ostream &OS) { @@ -288,8 +291,8 @@ void MCDecodedPseudoProbe::getInlineContext( // Note that it won't include the probe's belonging function(leaf location) while (Cur->hasInlineSite()) { StringRef FuncName = getProbeFNameForGUID(GUID2FuncMAP, Cur->Parent->Guid); - ContextStack.emplace_back( - MCPseudoProbeFrameLocation(FuncName, std::get<1>(Cur->ISite))); + ContextStack.emplace_back(MCPseudoProbeFrameLocation( + FuncName, std::get<1>(Cur->getInlineSite()))); Cur = static_cast(Cur->Parent); } // Make the ContextStack in caller-callee order @@ -317,10 +320,10 @@ void MCDecodedPseudoProbe::print(raw_ostream &OS, bool ShowName) const { OS << "FUNC: "; if (ShowName) { - StringRef FuncName = getProbeFNameForGUID(GUID2FuncMAP, Guid); + StringRef FuncName = getProbeFNameForGUID(GUID2FuncMAP, getGuid()); OS << FuncName.str() << " "; } else { - OS << Guid << " "; + OS << getGuid() << " "; } OS << "Index: " << Index << " "; if (Discriminator) @@ -387,59 +390,68 @@ bool MCPseudoProbeDecoder::buildGUID2FuncDescMap(const uint8_t *Start, Data = Start; End = Data + Size; + uint32_t FuncDescCount = 0; while (Data < End) { - auto ErrorOrGUID = readUnencodedNumber(); - if (!ErrorOrGUID) + // GUID + if (!readUnencodedNumber()) return false; - - auto ErrorOrHash = readUnencodedNumber(); - if (!ErrorOrHash) + // Hash + if (!readUnencodedNumber()) return false; auto ErrorOrNameSize = readUnsignedNumber(); if (!ErrorOrNameSize) return false; - uint32_t NameSize = std::move(*ErrorOrNameSize); - - auto ErrorOrName = readString(NameSize); - if (!ErrorOrName) + // Function name + if (!readString(*ErrorOrNameSize)) return false; + ++FuncDescCount; + } + assert(Data == End && "Have unprocessed data in pseudo_probe_desc section"); + GUID2FuncDescMap.reserve(FuncDescCount); - uint64_t GUID = std::move(*ErrorOrGUID); - 
uint64_t Hash = std::move(*ErrorOrHash); - StringRef Name = std::move(*ErrorOrName); + Data = Start; + End = Data + Size; + while (Data < End) { + uint64_t GUID = + cantFail(errorOrToExpected(readUnencodedNumber())); + uint64_t Hash = + cantFail(errorOrToExpected(readUnencodedNumber())); + uint32_t NameSize = + cantFail(errorOrToExpected(readUnsignedNumber())); + StringRef Name = cantFail(errorOrToExpected(readString(NameSize))); // Initialize PseudoProbeFuncDesc and populate it into GUID2FuncDescMap - GUID2FuncDescMap.emplace(GUID, MCPseudoProbeFuncDesc(GUID, Hash, Name)); + GUID2FuncDescMap.emplace_back(GUID, Hash, Name.copy(FuncNameAllocator)); } assert(Data == End && "Have unprocessed data in pseudo_probe_desc section"); + assert(GUID2FuncDescMap.size() == FuncDescCount && + "Mismatching function description count pre- and post-parsing"); + llvm::sort(GUID2FuncDescMap, [](const auto &LHS, const auto &RHS) { + return LHS.FuncGUID < RHS.FuncGUID; + }); return true; } +template bool MCPseudoProbeDecoder::buildAddress2ProbeMap( MCDecodedPseudoProbeInlineTree *Cur, uint64_t &LastAddr, - const Uint64Set &GuidFilter, const Uint64Map &FuncStartAddrs) { + const Uint64Set &GuidFilter, const Uint64Map &FuncStartAddrs, + const uint32_t CurChildIndex) { // The pseudo_probe section encodes an inline forest and each tree has a // format defined in MCPseudoProbe.h uint32_t Index = 0; - bool IsTopLevelFunc = Cur == &DummyInlineRoot; if (IsTopLevelFunc) { // Use a sequential id for top level inliner. - Index = Cur->getChildren().size(); + Index = CurChildIndex; } else { // Read inline site for inlinees - auto ErrorOrIndex = readUnsignedNumber(); - if (!ErrorOrIndex) - return false; - Index = std::move(*ErrorOrIndex); + Index = cantFail(errorOrToExpected(readUnsignedNumber())); } // Read guid - auto ErrorOrCurGuid = readUnencodedNumber(); - if (!ErrorOrCurGuid) - return false; - uint64_t Guid = std::move(*ErrorOrCurGuid); + uint64_t Guid = cantFail(errorOrToExpected(readUnencodedNumber())); // Decide if top-level node should be disgarded. if (IsTopLevelFunc && !GuidFilter.empty() && !GuidFilter.count(Guid)) @@ -448,8 +460,9 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap( // If the incoming node is null, all its children nodes should be disgarded. if (Cur) { // Switch/add to a new tree node(inlinee) - Cur = Cur->getOrAddNode(std::make_tuple(Guid, Index)); - Cur->Guid = Guid; + Cur->getChildren()[CurChildIndex] = + MCDecodedPseudoProbeInlineTree(InlineSite(Guid, Index), Cur); + Cur = &Cur->getChildren()[CurChildIndex]; if (IsTopLevelFunc && !EncodingIsAddrBased) { if (auto V = FuncStartAddrs.lookup(Guid)) LastAddr = V; @@ -457,41 +470,28 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap( } // Read number of probes in the current node. 
- auto ErrorOrNodeCount = readUnsignedNumber(); - if (!ErrorOrNodeCount) - return false; - uint32_t NodeCount = std::move(*ErrorOrNodeCount); + uint32_t NodeCount = + cantFail(errorOrToExpected(readUnsignedNumber())); + uint32_t CurrentProbeCount = 0; // Read number of direct inlinees - auto ErrorOrCurChildrenToProcess = readUnsignedNumber(); - if (!ErrorOrCurChildrenToProcess) - return false; + uint32_t ChildrenToProcess = + cantFail(errorOrToExpected(readUnsignedNumber())); // Read all probes in this node for (std::size_t I = 0; I < NodeCount; I++) { // Read index - auto ErrorOrIndex = readUnsignedNumber(); - if (!ErrorOrIndex) - return false; - uint32_t Index = std::move(*ErrorOrIndex); + uint32_t Index = + cantFail(errorOrToExpected(readUnsignedNumber())); // Read type | flag. - auto ErrorOrValue = readUnencodedNumber(); - if (!ErrorOrValue) - return false; - uint8_t Value = std::move(*ErrorOrValue); + uint8_t Value = cantFail(errorOrToExpected(readUnencodedNumber())); uint8_t Kind = Value & 0xf; uint8_t Attr = (Value & 0x70) >> 4; // Read address uint64_t Addr = 0; if (Value & 0x80) { - auto ErrorOrOffset = readSignedNumber(); - if (!ErrorOrOffset) - return false; - int64_t Offset = std::move(*ErrorOrOffset); + int64_t Offset = cantFail(errorOrToExpected(readSignedNumber())); Addr = LastAddr + Offset; } else { - auto ErrorOrAddr = readUnencodedNumber(); - if (!ErrorOrAddr) - return false; - Addr = std::move(*ErrorOrAddr); + Addr = cantFail(errorOrToExpected(readUnencodedNumber())); if (isSentinelProbe(Attr)) { // For sentinel probe, the addr field actually stores the GUID of the // split function. Convert it to the real address. @@ -508,85 +508,189 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap( uint32_t Discriminator = 0; if (hasDiscriminator(Attr)) { - auto ErrorOrDiscriminator = readUnsignedNumber(); - if (!ErrorOrDiscriminator) - return false; - Discriminator = std::move(*ErrorOrDiscriminator); + Discriminator = + cantFail(errorOrToExpected(readUnsignedNumber())); } if (Cur && !isSentinelProbe(Attr)) { - // Populate Address2ProbesMap - auto &Probes = Address2ProbesMap[Addr]; - Probes.emplace_back(Addr, Cur->Guid, Index, PseudoProbeType(Kind), Attr, - Discriminator, Cur); - Cur->addProbes(&Probes.back()); + PseudoProbeVec.emplace_back(Addr, Index, PseudoProbeType(Kind), Attr, + Discriminator, Cur); + ++CurrentProbeCount; } LastAddr = Addr; } - uint32_t ChildrenToProcess = std::move(*ErrorOrCurChildrenToProcess); + if (Cur) { + Cur->setProbes( + MutableArrayRef(PseudoProbeVec).take_back(CurrentProbeCount)); + InlineTreeVec.resize(InlineTreeVec.size() + ChildrenToProcess); + Cur->getChildren() = + MutableArrayRef(InlineTreeVec).take_back(ChildrenToProcess); + } for (uint32_t I = 0; I < ChildrenToProcess; I++) { - buildAddress2ProbeMap(Cur, LastAddr, GuidFilter, FuncStartAddrs); + buildAddress2ProbeMap(Cur, LastAddr, GuidFilter, FuncStartAddrs, I); + } + return Cur; +} + +template +bool MCPseudoProbeDecoder::countRecords(bool &Discard, uint32_t &ProbeCount, + uint32_t &InlinedCount, + const Uint64Set &GuidFilter) { + if (!IsTopLevelFunc) + // Read inline site for inlinees + if (!readUnsignedNumber()) + return false; + + // Read guid + auto ErrorOrCurGuid = readUnencodedNumber(); + if (!ErrorOrCurGuid) + return false; + uint64_t Guid = std::move(*ErrorOrCurGuid); + + // Decide if top-level node should be disgarded. + if (IsTopLevelFunc) { + Discard = !GuidFilter.empty() && !GuidFilter.count(Guid); + if (!Discard) + // Allocate an entry for top-level function record. 
+ ++InlinedCount; + } + + // Read number of probes in the current node. + auto ErrorOrNodeCount = readUnsignedNumber(); + if (!ErrorOrNodeCount) + return false; + uint32_t NodeCount = std::move(*ErrorOrNodeCount); + uint32_t CurrentProbeCount = 0; + + // Read number of direct inlinees + auto ErrorOrCurChildrenToProcess = readUnsignedNumber(); + if (!ErrorOrCurChildrenToProcess) + return false; + uint32_t ChildrenToProcess = std::move(*ErrorOrCurChildrenToProcess); + + // Read all probes in this node + for (std::size_t I = 0; I < NodeCount; I++) { + // Read index + if (!readUnsignedNumber()) + return false; + + // Read type | flag. + auto ErrorOrValue = readUnencodedNumber(); + if (!ErrorOrValue) + return false; + uint8_t Value = std::move(*ErrorOrValue); + + uint8_t Attr = (Value & 0x70) >> 4; + if (Value & 0x80) { + // Offset + if (!readSignedNumber()) + return false; + } else { + // Addr + if (!readUnencodedNumber()) + return false; + } + + if (hasDiscriminator(Attr)) + // Discriminator + if (!readUnsignedNumber()) + return false; + + if (!Discard && !isSentinelProbe(Attr)) + ++CurrentProbeCount; + } + + if (!Discard) { + ProbeCount += CurrentProbeCount; + InlinedCount += ChildrenToProcess; } + for (uint32_t I = 0; I < ChildrenToProcess; I++) + if (!countRecords(Discard, ProbeCount, InlinedCount, GuidFilter)) + return false; return true; } bool MCPseudoProbeDecoder::buildAddress2ProbeMap( const uint8_t *Start, std::size_t Size, const Uint64Set &GuidFilter, const Uint64Map &FuncStartAddrs) { + // For function records in the order of their appearance in the encoded data + // (DFS), count the number of contained probes and inlined function records. + uint32_t ProbeCount = 0; + uint32_t InlinedCount = 0; + uint32_t TopLevelFuncs = 0; + Data = Start; + End = Data + Size; + bool Discard = false; + while (Data < End) { + if (!countRecords(Discard, ProbeCount, InlinedCount, GuidFilter)) + return false; + TopLevelFuncs += !Discard; + } + assert(Data == End && "Have unprocessed data in pseudo_probe section"); + PseudoProbeVec.reserve(ProbeCount); + InlineTreeVec.reserve(InlinedCount); + + // Allocate top-level function records as children of DummyInlineRoot. 
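Editor's note on the MCPseudoProbeDecoder rework above: the pseudo_probe section is now parsed twice, a countRecords pass that sizes PseudoProbeVec and InlineTreeVec, then a second pass that fills the pre-reserved storage. The sketch below shows that count-then-reserve-then-fill pattern on an invented length-prefixed record format, not the real probe encoding.

// Editor's sketch of the two-pass decode: pass 1 only counts records so the
// vector can be reserved up front; pass 2 builds the records in place, so
// references into the vector stay stable. The blob format here is made up.
#include <cstddef>
#include <cstdint>
#include <vector>

struct Record { const uint8_t *Payload; uint8_t Size; };

static size_t countRecords(const uint8_t *Data, const uint8_t *End) {
  size_t N = 0;
  while (Data < End) {
    uint8_t Len = *Data++; // 1-byte length prefix
    Data += Len;
    ++N;
  }
  return N;
}

std::vector<Record> decode(const uint8_t *Data, size_t Size) {
  const uint8_t *End = Data + Size;
  std::vector<Record> Records;
  Records.reserve(countRecords(Data, End)); // pass 1: size the storage
  while (Data < End) {                      // pass 2: fill without reallocating
    uint8_t Len = *Data++;
    Records.push_back({Data, Len});
    Data += Len;
  }
  return Records;
}

int main() {
  const uint8_t Blob[] = {2, 'h', 'i', 1, '!'};
  return decode(Blob, sizeof(Blob)).size() == 2 ? 0 : 1;
}

Reserving up front matters here because the new decoder hands out MutableArrayRef slices into these vectors; a reallocation would invalidate them.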
+ InlineTreeVec.resize(TopLevelFuncs); + DummyInlineRoot.getChildren() = MutableArrayRef(InlineTreeVec); + Data = Start; End = Data + Size; uint64_t LastAddr = 0; + uint32_t CurChildIndex = 0; while (Data < End) - buildAddress2ProbeMap(&DummyInlineRoot, LastAddr, GuidFilter, - FuncStartAddrs); + CurChildIndex += buildAddress2ProbeMap( + &DummyInlineRoot, LastAddr, GuidFilter, FuncStartAddrs, CurChildIndex); assert(Data == End && "Have unprocessed data in pseudo_probe section"); + assert(PseudoProbeVec.size() == ProbeCount && + "Mismatching probe count pre- and post-parsing"); + assert(InlineTreeVec.size() == InlinedCount && + "Mismatching function records count pre- and post-parsing"); + + std::vector> SortedA2P(ProbeCount); + for (const auto &[I, Probe] : llvm::enumerate(PseudoProbeVec)) + SortedA2P[I] = {Probe.getAddress(), I}; + llvm::sort(SortedA2P); + Address2ProbesMap.reserve(ProbeCount); + for (const uint32_t I : llvm::make_second_range(SortedA2P)) + Address2ProbesMap.emplace_back(PseudoProbeVec[I]); + SortedA2P.clear(); return true; } void MCPseudoProbeDecoder::printGUID2FuncDescMap(raw_ostream &OS) { OS << "Pseudo Probe Desc:\n"; - // Make the output deterministic - std::map OrderedMap(GUID2FuncDescMap.begin(), - GUID2FuncDescMap.end()); - for (auto &I : OrderedMap) { - I.second.print(OS); - } + for (auto &I : GUID2FuncDescMap) + I.print(OS); } void MCPseudoProbeDecoder::printProbeForAddress(raw_ostream &OS, uint64_t Address) { - auto It = Address2ProbesMap.find(Address); - if (It != Address2ProbesMap.end()) { - for (const MCDecodedPseudoProbe &Probe : It->second) { - OS << " [Probe]:\t"; - Probe.print(OS, GUID2FuncDescMap, true); - } + for (const MCDecodedPseudoProbe &Probe : Address2ProbesMap.find(Address)) { + OS << " [Probe]:\t"; + Probe.print(OS, GUID2FuncDescMap, true); } } void MCPseudoProbeDecoder::printProbesForAllAddresses(raw_ostream &OS) { - auto Entries = make_first_range(Address2ProbesMap); - SmallVector Addresses(Entries.begin(), Entries.end()); - llvm::sort(Addresses); - for (auto K : Addresses) { - OS << "Address:\t"; - OS << K; - OS << "\n"; - printProbeForAddress(OS, K); + uint64_t PrevAddress = INT64_MAX; + for (MCDecodedPseudoProbe &Probe : Address2ProbesMap) { + uint64_t Address = Probe.getAddress(); + if (Address != PrevAddress) { + PrevAddress = Address; + OS << "Address:\t" << Address << '\n'; + } + OS << " [Probe]:\t"; + Probe.print(OS, GUID2FuncDescMap, true); } } const MCDecodedPseudoProbe * MCPseudoProbeDecoder::getCallProbeForAddr(uint64_t Address) const { - auto It = Address2ProbesMap.find(Address); - if (It == Address2ProbesMap.end()) - return nullptr; - const auto &Probes = It->second; - const MCDecodedPseudoProbe *CallProbe = nullptr; - for (const MCDecodedPseudoProbe &Probe : Probes) { + for (const MCDecodedPseudoProbe &Probe : Address2ProbesMap.find(Address)) { if (Probe.isCall()) { // Disabling the assert and returning first call probe seen so far. // Subsequent call probes, if any, are ignored. 
Due to the the way @@ -611,7 +715,7 @@ const MCPseudoProbeFuncDesc * MCPseudoProbeDecoder::getFuncDescForGUID(uint64_t GUID) const { auto It = GUID2FuncDescMap.find(GUID); assert(It != GUID2FuncDescMap.end() && "Function descriptor doesn't exist"); - return &It->second; + return &*It; } void MCPseudoProbeDecoder::getInlineContextForProbe( diff --git a/llvm/lib/MCA/InstrBuilder.cpp b/llvm/lib/MCA/InstrBuilder.cpp index 32b20d758ee70..c4d88856abdfb 100644 --- a/llvm/lib/MCA/InstrBuilder.cpp +++ b/llvm/lib/MCA/InstrBuilder.cpp @@ -799,7 +799,7 @@ InstrBuilder::createInstruction(const MCInst &MCI, unsigned WriteIndex = 0; Idx = 0U; for (const WriteDescriptor &WD : D.Writes) { - RegID = WD.isImplicitWrite() ? WD.RegisterID + RegID = WD.isImplicitWrite() ? MCRegister(WD.RegisterID) : MCI.getOperand(WD.OpIndex).getReg(); // Check if this is a optional definition that references NoReg or a write // to a constant register. diff --git a/llvm/lib/Object/COFFObjectFile.cpp b/llvm/lib/Object/COFFObjectFile.cpp index ff7129ba178cf..5fdf3baf8c02c 100644 --- a/llvm/lib/Object/COFFObjectFile.cpp +++ b/llvm/lib/Object/COFFObjectFile.cpp @@ -2369,7 +2369,7 @@ ResourceSectionRef::getContents(const coff_resource_data_entry &Entry) { Expected Contents = S.getContents(); if (!Contents) return Contents.takeError(); - return Contents->slice(Offset, Offset + Entry.DataSize); + return Contents->substr(Offset, Entry.DataSize); } } return createStringError(object_error::parse_failed, diff --git a/llvm/lib/Object/MachOObjectFile.cpp b/llvm/lib/Object/MachOObjectFile.cpp index 6f3dd4d8b5180..8fa3f67ea00f3 100644 --- a/llvm/lib/Object/MachOObjectFile.cpp +++ b/llvm/lib/Object/MachOObjectFile.cpp @@ -2099,7 +2099,7 @@ ArrayRef getSegmentContents(const MachOObjectFile &Obj, } auto &Segment = SegmentOrErr.get(); return arrayRefFromStringRef( - Obj.getData().slice(Segment.fileoff, Segment.fileoff + Segment.filesize)); + Obj.getData().substr(Segment.fileoff, Segment.filesize)); } } // namespace @@ -2454,9 +2454,8 @@ StringRef MachOObjectFile::guessLibraryShortName(StringRef Name, Idx = 0; else Idx = b+1; - F = Name.slice(Idx, Idx + Foo.size()); - DotFramework = Name.slice(Idx + Foo.size(), - Idx + Foo.size() + sizeof(".framework/")-1); + F = Name.substr(Idx, Foo.size()); + DotFramework = Name.substr(Idx + Foo.size(), sizeof(".framework/") - 1); if (F == Foo && DotFramework == ".framework/") { isFramework = true; return Foo; @@ -2476,9 +2475,8 @@ StringRef MachOObjectFile::guessLibraryShortName(StringRef Name, Idx = 0; else Idx = d+1; - F = Name.slice(Idx, Idx + Foo.size()); - DotFramework = Name.slice(Idx + Foo.size(), - Idx + Foo.size() + sizeof(".framework/")-1); + F = Name.substr(Idx, Foo.size()); + DotFramework = Name.substr(Idx + Foo.size(), sizeof(".framework/") - 1); if (F == Foo && DotFramework == ".framework/") { isFramework = true; return Foo; @@ -2495,7 +2493,7 @@ StringRef MachOObjectFile::guessLibraryShortName(StringRef Name, // First pull off the version letter for the form Foo.A.dylib if any. if (a >= 3) { - Dot = Name.slice(a-2, a-1); + Dot = Name.substr(a - 2, 1); if (Dot == ".") a = a - 2; } @@ -2520,7 +2518,7 @@ StringRef MachOObjectFile::guessLibraryShortName(StringRef Name, // There are incorrect library names of the form: // libATS.A_profile.dylib so check for these. 
if (Lib.size() >= 3) { - Dot = Lib.slice(Lib.size()-2, Lib.size()-1); + Dot = Lib.substr(Lib.size() - 2, 1); if (Dot == ".") Lib = Lib.slice(0, Lib.size()-2); } @@ -2537,7 +2535,7 @@ StringRef MachOObjectFile::guessLibraryShortName(StringRef Name, Lib = Name.slice(b+1, a); // There are library names of the form: QT.A.qtx so check for these. if (Lib.size() >= 3) { - Dot = Lib.slice(Lib.size()-2, Lib.size()-1); + Dot = Lib.substr(Lib.size() - 2, 1); if (Dot == ".") Lib = Lib.slice(0, Lib.size()-2); } diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index f92e9d3812513..35b642ac7f3a2 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -575,6 +575,45 @@ void Instruction::dumpOS(raw_ostream &OS) const { } #endif // NDEBUG +FreezeInst *FreezeInst::create(Value *V, BBIterator WhereIt, + BasicBlock *WhereBB, Context &Ctx, + const Twine &Name) { + auto &Builder = Ctx.getLLVMIRBuilder(); + if (WhereIt != WhereBB->end()) + Builder.SetInsertPoint((*WhereIt).getTopmostLLVMInstruction()); + else + Builder.SetInsertPoint(cast(WhereBB->Val)); + auto *LLVMI = cast(Builder.CreateFreeze(V->Val, Name)); + return Ctx.createFreezeInst(LLVMI); +} + +FenceInst *FenceInst::create(AtomicOrdering Ordering, BBIterator WhereIt, + BasicBlock *WhereBB, Context &Ctx, + SyncScope::ID SSID) { + auto &Builder = Ctx.getLLVMIRBuilder(); + if (WhereIt != WhereBB->end()) + Builder.SetInsertPoint((*WhereIt).getTopmostLLVMInstruction()); + else + Builder.SetInsertPoint(cast(WhereBB->Val)); + llvm::FenceInst *LLVMI = Builder.CreateFence(Ordering, SSID); + return Ctx.createFenceInst(LLVMI); +} + +void FenceInst::setOrdering(AtomicOrdering Ordering) { + Ctx.getTracker() + .emplaceIfTracking< + GenericSetter<&FenceInst::getOrdering, &FenceInst::setOrdering>>( + this); + cast(Val)->setOrdering(Ordering); +} + +void FenceInst::setSyncScopeID(SyncScope::ID SSID) { + Ctx.getTracker() + .emplaceIfTracking>(this); + cast(Val)->setSyncScopeID(SSID); +} + Value *SelectInst::createCommon(Value *Cond, Value *True, Value *False, const Twine &Name, IRBuilder<> &Builder, Context &Ctx) { @@ -945,8 +984,8 @@ void InvokeInst::setUnwindDest(BasicBlock *BB) { setOperand(2, BB); assert(getUnwindDest() == BB && "LLVM IR uses a different operan index!"); } -Instruction *InvokeInst::getLandingPadInst() const { - return cast( +LandingPadInst *InvokeInst::getLandingPadInst() const { + return cast( Ctx.getValue(cast(Val)->getLandingPadInst())); ; } @@ -1043,6 +1082,31 @@ BasicBlock *CallBrInst::getSuccessor(unsigned Idx) const { Ctx.getValue(cast(Val)->getSuccessor(Idx))); } +LandingPadInst *LandingPadInst::create(Type *RetTy, unsigned NumReservedClauses, + BBIterator WhereIt, BasicBlock *WhereBB, + Context &Ctx, const Twine &Name) { + auto &Builder = Ctx.getLLVMIRBuilder(); + if (WhereIt != WhereBB->end()) + Builder.SetInsertPoint((*WhereIt).getTopmostLLVMInstruction()); + else + Builder.SetInsertPoint(cast(WhereBB->Val)); + llvm::LandingPadInst *LLVMI = + Builder.CreateLandingPad(RetTy, NumReservedClauses, Name); + return Ctx.createLandingPadInst(LLVMI); +} + +void LandingPadInst::setCleanup(bool V) { + Ctx.getTracker() + .emplaceIfTracking>(this); + cast(Val)->setCleanup(V); +} + +Constant *LandingPadInst::getClause(unsigned Idx) const { + return cast( + Ctx.getValue(cast(Val)->getClause(Idx))); +} + Value *FuncletPadInst::getParentPad() const { return Ctx.getValue(cast(Val)->getParentPad()); } @@ -1666,6 +1730,14 @@ Value *BinaryOperator::createWithCopiedFlags(Instruction::Opcode Op, 
Value *LHS, InsertAtEnd, Ctx, Name); } +void PossiblyDisjointInst::setIsDisjoint(bool B) { + Ctx.getTracker() + .emplaceIfTracking>( + this); + cast(Val)->setIsDisjoint(B); +} + void AtomicRMWInst::setAlignment(Align Align) { Ctx.getTracker() .emplaceIfTrackinggetSubclassID() == ClassID::Cast; } +void PossiblyNonNegInst::setNonNeg(bool B) { + Ctx.getTracker() + .emplaceIfTracking>(this); + cast(Val)->setNonNeg(B); +} + Value *InsertElementInst::create(Value *Vec, Value *NewElt, Value *Idx, Instruction *InsertBefore, Context &Ctx, const Twine &Name) { @@ -2157,6 +2236,16 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { assert(isa(LLVMV) && "Expected Instruction"); switch (cast(LLVMV)->getOpcode()) { + case llvm::Instruction::Freeze: { + auto *LLVMFreeze = cast(LLVMV); + It->second = std::unique_ptr(new FreezeInst(LLVMFreeze, *this)); + return It->second.get(); + } + case llvm::Instruction::Fence: { + auto *LLVMFence = cast(LLVMV); + It->second = std::unique_ptr(new FenceInst(LLVMFence, *this)); + return It->second.get(); + } case llvm::Instruction::Select: { auto *LLVMSel = cast(LLVMV); It->second = std::unique_ptr(new SelectInst(LLVMSel, *this)); @@ -2215,6 +2304,12 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { It->second = std::unique_ptr(new CallBrInst(LLVMCallBr, *this)); return It->second.get(); } + case llvm::Instruction::LandingPad: { + auto *LLVMLPad = cast(LLVMV); + It->second = + std::unique_ptr(new LandingPadInst(LLVMLPad, *this)); + return It->second.get(); + } case llvm::Instruction::CatchPad: { auto *LLVMCPI = cast(LLVMV); It->second = @@ -2349,6 +2444,16 @@ BasicBlock *Context::createBasicBlock(llvm::BasicBlock *LLVMBB) { return BB; } +FreezeInst *Context::createFreezeInst(llvm::FreezeInst *SI) { + auto NewPtr = std::unique_ptr(new FreezeInst(SI, *this)); + return cast(registerValue(std::move(NewPtr))); +} + +FenceInst *Context::createFenceInst(llvm::FenceInst *SI) { + auto NewPtr = std::unique_ptr(new FenceInst(SI, *this)); + return cast(registerValue(std::move(NewPtr))); +} + SelectInst *Context::createSelectInst(llvm::SelectInst *SI) { auto NewPtr = std::unique_ptr(new SelectInst(SI, *this)); return cast(registerValue(std::move(NewPtr))); @@ -2415,6 +2520,10 @@ UnreachableInst *Context::createUnreachableInst(llvm::UnreachableInst *UI) { std::unique_ptr(new UnreachableInst(UI, *this)); return cast(registerValue(std::move(NewPtr))); } +LandingPadInst *Context::createLandingPadInst(llvm::LandingPadInst *I) { + auto NewPtr = std::unique_ptr(new LandingPadInst(I, *this)); + return cast(registerValue(std::move(NewPtr))); +} CatchPadInst *Context::createCatchPadInst(llvm::CatchPadInst *I) { auto NewPtr = std::unique_ptr(new CatchPadInst(I, *this)); return cast(registerValue(std::move(NewPtr))); diff --git a/llvm/lib/TableGen/DetailedRecordsBackend.cpp b/llvm/lib/TableGen/DetailedRecordsBackend.cpp index 500aa4c78225d..45e621483c817 100644 --- a/llvm/lib/TableGen/DetailedRecordsBackend.cpp +++ b/llvm/lib/TableGen/DetailedRecordsBackend.cpp @@ -21,36 +21,30 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" -#include -#include #include #include -#define DEBUG_TYPE "detailed-records-backend" - -#define NL "\n" - using namespace llvm; namespace { class DetailedRecordsEmitter { private: - RecordKeeper &Records; + const RecordKeeper &Records; public: - DetailedRecordsEmitter(RecordKeeper &RK) : Records(RK) {} + explicit DetailedRecordsEmitter(const RecordKeeper 
&RK) : Records(RK) {} void run(raw_ostream &OS); void printReportHeading(raw_ostream &OS); + void printSectionHeading(StringRef Title, int Count, raw_ostream &OS); void printVariables(raw_ostream &OS); void printClasses(raw_ostream &OS); void printRecords(raw_ostream &OS); - void printSectionHeading(StringRef Title, int Count, raw_ostream &OS); - void printDefms(Record *Rec, raw_ostream &OS); - void printTemplateArgs(Record *Rec, raw_ostream &OS); - void printSuperclasses(Record *Rec, raw_ostream &OS); - void printFields(Record *Rec, raw_ostream &OS); + void printDefms(const Record &Rec, raw_ostream &OS); + void printTemplateArgs(const Record &Rec, raw_ostream &OS); + void printSuperclasses(const Record &Rec, raw_ostream &OS); + void printFields(const Record &Rec, raw_ostream &OS); }; // emitter class } // anonymous namespace @@ -68,15 +62,21 @@ void DetailedRecordsEmitter::printReportHeading(raw_ostream &OS) { OS << formatv("DETAILED RECORDS for file {0}\n", Records.getInputFilename()); } +// Print a section heading with the name of the section and +// the item count. +void DetailedRecordsEmitter::printSectionHeading(StringRef Title, int Count, + raw_ostream &OS) { + OS << formatv("\n{0} {1} ({2}) {0}\n", "--------------------", Title, Count); +} + // Print the global variables. void DetailedRecordsEmitter::printVariables(raw_ostream &OS) { const auto GlobalList = Records.getGlobals(); printSectionHeading("Global Variables", GlobalList.size(), OS); - OS << NL; - for (const auto &Var : GlobalList) { - OS << Var.first << " = " << Var.second->getAsString() << NL; - } + OS << '\n'; + for (const auto &Var : GlobalList) + OS << Var.first << " = " << Var.second->getAsString() << '\n'; } // Print the classes, including the template arguments, superclasses, @@ -85,13 +85,12 @@ void DetailedRecordsEmitter::printClasses(raw_ostream &OS) { const auto &ClassList = Records.getClasses(); printSectionHeading("Classes", ClassList.size(), OS); - for (const auto &ClassPair : ClassList) { - auto *const Class = ClassPair.second.get(); + for (const auto &[Name, Class] : ClassList) { OS << formatv("\n{0} |{1}|\n", Class->getNameInitAsString(), SrcMgr.getFormattedLocationNoOffset(Class->getLoc().front())); - printTemplateArgs(Class, OS); - printSuperclasses(Class, OS); - printFields(Class, OS); + printTemplateArgs(*Class, OS); + printSuperclasses(*Class, OS); + printFields(*Class, OS); } } @@ -101,42 +100,33 @@ void DetailedRecordsEmitter::printRecords(raw_ostream &OS) { const auto &RecordList = Records.getDefs(); printSectionHeading("Records", RecordList.size(), OS); - for (const auto &RecPair : RecordList) { - auto *const Rec = RecPair.second.get(); + for (const auto &[DefName, Rec] : RecordList) { std::string Name = Rec->getNameInitAsString(); OS << formatv("\n{0} |{1}|\n", Name.empty() ? "\"\"" : Name, SrcMgr.getFormattedLocationNoOffset(Rec->getLoc().front())); - printDefms(Rec, OS); - printSuperclasses(Rec, OS); - printFields(Rec, OS); + printDefms(*Rec, OS); + printSuperclasses(*Rec, OS); + printFields(*Rec, OS); } } -// Print a section heading with the name of the section and -// the item count. -void DetailedRecordsEmitter::printSectionHeading(StringRef Title, int Count, - raw_ostream &OS) { - OS << formatv("\n{0} {1} ({2}) {0}\n", "--------------------", Title, Count); -} - // Print the record's defm source locations, if any. Note that they // are stored in the reverse order of their invocation. 
-void DetailedRecordsEmitter::printDefms(Record *Rec, raw_ostream &OS) { - const auto &LocList = Rec->getLoc(); +void DetailedRecordsEmitter::printDefms(const Record &Rec, raw_ostream &OS) { + const auto &LocList = Rec.getLoc(); if (LocList.size() < 2) return; OS << " Defm sequence:"; - for (unsigned I = LocList.size() - 1; I >= 1; --I) { - OS << formatv(" |{0}|", SrcMgr.getFormattedLocationNoOffset(LocList[I])); - } - OS << NL; + for (const SMLoc Loc : reverse(LocList)) + OS << formatv(" |{0}|", SrcMgr.getFormattedLocationNoOffset(Loc)); + OS << '\n'; } // Print the template arguments of a class. -void DetailedRecordsEmitter::printTemplateArgs(Record *Rec, +void DetailedRecordsEmitter::printTemplateArgs(const Record &Rec, raw_ostream &OS) { - ArrayRef Args = Rec->getTemplateArgs(); + ArrayRef Args = Rec.getTemplateArgs(); if (Args.empty()) { OS << " Template args: (none)\n"; return; @@ -144,38 +134,38 @@ void DetailedRecordsEmitter::printTemplateArgs(Record *Rec, OS << " Template args:\n"; for (const Init *ArgName : Args) { - const RecordVal *Value = Rec->getValue(ArgName); + const RecordVal *Value = Rec.getValue(ArgName); assert(Value && "Template argument value not found."); OS << " "; Value->print(OS, false); - OS << formatv(" |{0}|", SrcMgr.getFormattedLocationNoOffset(Value->getLoc())); - OS << NL; + OS << formatv(" |{0}|\n", + SrcMgr.getFormattedLocationNoOffset(Value->getLoc())); } } // Print the superclasses of a class or record. Indirect superclasses // are enclosed in parentheses. -void DetailedRecordsEmitter::printSuperclasses(Record *Rec, raw_ostream &OS) { - ArrayRef> Superclasses = Rec->getSuperClasses(); +void DetailedRecordsEmitter::printSuperclasses(const Record &Rec, + raw_ostream &OS) { + ArrayRef> Superclasses = Rec.getSuperClasses(); if (Superclasses.empty()) { OS << " Superclasses: (none)\n"; return; } OS << " Superclasses:"; - for (const auto &SuperclassPair : Superclasses) { - auto *ClassRec = SuperclassPair.first; - if (Rec->hasDirectSuperClass(ClassRec)) + for (const auto &[ClassRec, Loc] : Superclasses) { + if (Rec.hasDirectSuperClass(ClassRec)) OS << formatv(" {0}", ClassRec->getNameInitAsString()); else OS << formatv(" ({0})", ClassRec->getNameInitAsString()); } - OS << NL; + OS << '\n'; } // Print the fields of a class or record, including their source locations. -void DetailedRecordsEmitter::printFields(Record *Rec, raw_ostream &OS) { - const auto &ValueList = Rec->getValues(); +void DetailedRecordsEmitter::printFields(const Record &Rec, raw_ostream &OS) { + const auto &ValueList = Rec.getValues(); if (ValueList.empty()) { OS << " Fields: (none)\n"; return; @@ -183,7 +173,7 @@ void DetailedRecordsEmitter::printFields(Record *Rec, raw_ostream &OS) { OS << " Fields:\n"; for (const RecordVal &Value : ValueList) - if (!Rec->isTemplateArg(Value.getNameInit())) { + if (!Rec.isTemplateArg(Value.getNameInit())) { OS << " "; Value.print(OS, false); OS << formatv(" |{0}|\n", @@ -191,13 +181,8 @@ void DetailedRecordsEmitter::printFields(Record *Rec, raw_ostream &OS) { } } -namespace llvm { - // This function is called by TableGen after parsing the files. - -void EmitDetailedRecords(RecordKeeper &RK, raw_ostream &OS) { +void llvm::EmitDetailedRecords(const RecordKeeper &RK, raw_ostream &OS) { // Instantiate the emitter class and invoke run(). 
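Editor's note on the DetailedRecordsBackend.cpp hunks above: map iteration switches to structured bindings (`for (const auto &[Name, Class] : ClassList)`), and printDefms walks the location list with `reverse(LocList)` instead of a manual descending index loop. A small sketch of both idioms, with placeholder map and list contents:

// Editor's sketch: structured bindings over a map, and llvm::reverse()
// in place of a hand-written backwards index loop.
#include "llvm/ADT/STLExtras.h"
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  std::map<std::string, int> ClassList = {{"Foo", 1}, {"Bar", 2}};
  for (const auto &[Name, Count] : ClassList) // no more Pair.first / Pair.second
    std::cout << Name << " -> " << Count << "\n";

  std::vector<std::string> LocList = {"first.td:1", "second.td:2", "third.td:3"};
  for (const std::string &Loc : llvm::reverse(LocList)) // newest invocation first
    std::cout << " |" << Loc << "|";
  std::cout << "\n";
}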
DetailedRecordsEmitter(RK).run(OS); } - -} // namespace llvm diff --git a/llvm/lib/TableGen/JSONBackend.cpp b/llvm/lib/TableGen/JSONBackend.cpp index cd10c22094e45..d648019ac46e8 100644 --- a/llvm/lib/TableGen/JSONBackend.cpp +++ b/llvm/lib/TableGen/JSONBackend.cpp @@ -26,43 +26,41 @@ namespace { class JSONEmitter { private: - RecordKeeper &Records; + const RecordKeeper &Records; json::Value translateInit(const Init &I); public: - JSONEmitter(RecordKeeper &R); + explicit JSONEmitter(const RecordKeeper &R) : Records(R) {} void run(raw_ostream &OS); }; } // end anonymous namespace -JSONEmitter::JSONEmitter(RecordKeeper &R) : Records(R) {} - json::Value JSONEmitter::translateInit(const Init &I) { - // Init subclasses that we return as JSON primitive values of one // kind or another. - if (isa(&I)) { + if (isa(&I)) return nullptr; - } else if (auto *Bit = dyn_cast(&I)) { + if (const auto *Bit = dyn_cast(&I)) return Bit->getValue() ? 1 : 0; - } else if (auto *Bits = dyn_cast(&I)) { - json::Array array; - for (unsigned i = 0, limit = Bits->getNumBits(); i < limit; i++) - array.push_back(translateInit(*Bits->getBit(i))); - return std::move(array); - } else if (auto *Int = dyn_cast(&I)) { + if (const auto *Bits = dyn_cast(&I)) { + json::Array Array; + for (unsigned Idx = 0, E = Bits->getNumBits(); Idx < E; ++Idx) + Array.push_back(translateInit(*Bits->getBit(Idx))); + return std::move(Array); + } + if (const auto *Int = dyn_cast(&I)) return Int->getValue(); - } else if (auto *Str = dyn_cast(&I)) { + if (const auto *Str = dyn_cast(&I)) return Str->getValue(); - } else if (auto *List = dyn_cast(&I)) { - json::Array array; - for (auto *val : *List) - array.push_back(translateInit(*val)); - return std::move(array); + if (const auto *List = dyn_cast(&I)) { + json::Array Array; + for (const auto *Val : *List) + Array.push_back(translateInit(*Val)); + return std::move(Array); } // Init subclasses that we return as JSON objects containing a @@ -70,56 +68,58 @@ json::Value JSONEmitter::translateInit(const Init &I) { // translation back into TableGen input syntax that -print-records // would give. 
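For the JSON backend, translateInit() either maps an Init directly to a JSON primitive (null, 0/1, integer, string, array) or wraps it in a small object carrying a "kind" discriminator plus the "printable" fallback. A minimal sketch of building one such object with llvm::json (assumes LLVM's Support library; the values are illustrative):

#include "llvm/Support/JSON.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Mirrors the VarBitInit case: an object with kind/var/index fields.
static json::Value makeVarBit(StringRef VarName, unsigned BitNum) {
  json::Object Obj;
  Obj["kind"] = "varbit";
  Obj["var"] = VarName;
  Obj["index"] = BitNum;
  return std::move(Obj); // json::Value takes Object by rvalue, so move it.
}

int main() {
  // Key order in the printed form is chosen by the serializer.
  outs() << makeVarBit("Inst", 31) << "\n";
}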
- json::Object obj; - obj["printable"] = I.getAsString(); - - if (auto *Def = dyn_cast(&I)) { - obj["kind"] = "def"; - obj["def"] = Def->getDef()->getName(); - return std::move(obj); - } else if (auto *Var = dyn_cast(&I)) { - obj["kind"] = "var"; - obj["var"] = Var->getName(); - return std::move(obj); - } else if (auto *VarBit = dyn_cast(&I)) { - if (auto *Var = dyn_cast(VarBit->getBitVar())) { - obj["kind"] = "varbit"; - obj["var"] = Var->getName(); - obj["index"] = VarBit->getBitNum(); - return std::move(obj); + json::Object Obj; + Obj["printable"] = I.getAsString(); + + if (const auto *Def = dyn_cast(&I)) { + Obj["kind"] = "def"; + Obj["def"] = Def->getDef()->getName(); + return std::move(Obj); + } + if (const auto *Var = dyn_cast(&I)) { + Obj["kind"] = "var"; + Obj["var"] = Var->getName(); + return std::move(Obj); + } + if (const auto *VarBit = dyn_cast(&I)) { + if (const auto *Var = dyn_cast(VarBit->getBitVar())) { + Obj["kind"] = "varbit"; + Obj["var"] = Var->getName(); + Obj["index"] = VarBit->getBitNum(); + return std::move(Obj); } - } else if (auto *Dag = dyn_cast(&I)) { - obj["kind"] = "dag"; - obj["operator"] = translateInit(*Dag->getOperator()); + } + if (const auto *Dag = dyn_cast(&I)) { + Obj["kind"] = "dag"; + Obj["operator"] = translateInit(*Dag->getOperator()); if (auto name = Dag->getName()) - obj["name"] = name->getAsUnquotedString(); - json::Array args; - for (unsigned i = 0, limit = Dag->getNumArgs(); i < limit; ++i) { - json::Array arg; - arg.push_back(translateInit(*Dag->getArg(i))); - if (auto argname = Dag->getArgName(i)) - arg.push_back(argname->getAsUnquotedString()); + Obj["name"] = name->getAsUnquotedString(); + json::Array Args; + for (unsigned Idx = 0, E = Dag->getNumArgs(); Idx < E; ++Idx) { + json::Array Arg; + Arg.push_back(translateInit(*Dag->getArg(Idx))); + if (const auto ArgName = Dag->getArgName(Idx)) + Arg.push_back(ArgName->getAsUnquotedString()); else - arg.push_back(nullptr); - args.push_back(std::move(arg)); + Arg.push_back(nullptr); + Args.push_back(std::move(Arg)); } - obj["args"] = std::move(args); - return std::move(obj); + Obj["args"] = std::move(Args); + return std::move(Obj); } // Final fallback: anything that gets past here is simply given a // kind field of 'complex', and the only other field is the standard // 'printable' representation. - assert(!I.isConcrete()); - obj["kind"] = "complex"; - return std::move(obj); + Obj["kind"] = "complex"; + return std::move(Obj); } void JSONEmitter::run(raw_ostream &OS) { - json::Object root; + json::Object Root; - root["!tablegen_json_version"] = 1; + Root["!tablegen_json_version"] = 1; // Prepare the arrays that will list the instances of every class. // We mostly fill those in by iterating over the superclasses of @@ -127,64 +127,59 @@ void JSONEmitter::run(raw_ostream &OS) { // class with no instances at all, so we do a preliminary iteration // over the classes, invoking std::map::operator[] to default- // construct the array for each one. - std::map instance_lists; - for (const auto &C : Records.getClasses()) { - const auto Name = C.second->getNameInitAsString(); - (void)instance_lists[Name]; - } + std::map InstanceLists; + for (const auto &[ClassName, ClassRec] : Records.getClasses()) + InstanceLists.emplace(ClassRec->getNameInitAsString(), json::Array()); // Main iteration over the defs. 
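The preliminary loop over the classes exists so that a class with zero defs still gets an (empty) entry in "!instanceof"; the comment's mention of std::map::operator[] is now a slight anachronism since the new code uses emplace, but the effect is the same because emplace never overwrites an existing key. A small stand-alone illustration (class and def names are made up):

#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  std::map<std::string, std::vector<std::string>> InstanceLists;
  // Pre-seed: every class gets an entry, even if nothing derives from it.
  for (const char *Class : {"Instruction", "Intrinsic", "UnusedClass"})
    InstanceLists.emplace(Class, std::vector<std::string>());

  InstanceLists["Instruction"].push_back("ADDri"); // from the def iteration

  for (const auto &[Name, Defs] : InstanceLists)
    std::printf("%s: %zu defs\n", Name.c_str(), Defs.size());
  // UnusedClass still prints, with 0 defs.
}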
- for (const auto &D : Records.getDefs()) { - const auto Name = D.second->getNameInitAsString(); - auto &Def = *D.second; + for (const auto &[DefName, Def] : Records.getDefs()) { + const std::string Name = Def->getNameInitAsString(); - json::Object obj; - json::Array fields; + json::Object Obj; + json::Array Fields; - for (const RecordVal &RV : Def.getValues()) { - if (!Def.isTemplateArg(RV.getNameInit())) { + for (const RecordVal &RV : Def->getValues()) { + if (!Def->isTemplateArg(RV.getNameInit())) { auto Name = RV.getNameInitAsString(); if (RV.isNonconcreteOK()) - fields.push_back(Name); - obj[Name] = translateInit(*RV.getValue()); + Fields.push_back(Name); + Obj[Name] = translateInit(*RV.getValue()); } } - obj["!fields"] = std::move(fields); + Obj["!fields"] = std::move(Fields); - json::Array superclasses; - for (const auto &SuperPair : Def.getSuperClasses()) - superclasses.push_back(SuperPair.first->getNameInitAsString()); - obj["!superclasses"] = std::move(superclasses); + json::Array SuperClasses; + // Add this def to the instance list for each of its superclasses. + for (const auto &[SuperClass, Loc] : Def->getSuperClasses()) { + std::string SuperName = SuperClass->getNameInitAsString(); + SuperClasses.push_back(SuperName); + InstanceLists[SuperName].push_back(Name); + } - obj["!name"] = Name; - obj["!anonymous"] = Def.isAnonymous(); + Obj["!superclasses"] = std::move(SuperClasses); - json::Array locs; - for (const SMLoc Loc : Def.getLoc()) - locs.push_back(SrcMgr.getFormattedLocationNoOffset(Loc)); - obj["!locs"] = std::move(locs); + Obj["!name"] = Name; + Obj["!anonymous"] = Def->isAnonymous(); - root[Name] = std::move(obj); + json::Array Locs; + for (const SMLoc Loc : Def->getLoc()) + Locs.push_back(SrcMgr.getFormattedLocationNoOffset(Loc)); + Obj["!locs"] = std::move(Locs); - // Add this def to the instance list for each of its superclasses. - for (const auto &SuperPair : Def.getSuperClasses()) { - auto SuperName = SuperPair.first->getNameInitAsString(); - instance_lists[SuperName].push_back(Name); - } + Root[Name] = std::move(Obj); } // Make a JSON object from the std::map of instance lists. - json::Object instanceof; - for (auto kv: instance_lists) - instanceof[kv.first] = std::move(kv.second); - root["!instanceof"] = std::move(instanceof); + json::Object InstanceOf; + for (auto &[ClassName, Instances] : InstanceLists) + InstanceOf[ClassName] = std::move(Instances); + Root["!instanceof"] = std::move(InstanceOf); // Done. Write the output. - OS << json::Value(std::move(root)) << "\n"; + OS << json::Value(std::move(Root)) << "\n"; } -namespace llvm { - -void EmitJSON(RecordKeeper &RK, raw_ostream &OS) { JSONEmitter(RK).run(OS); } -} // end namespace llvm +void llvm::EmitJSON(const RecordKeeper &RK, raw_ostream &OS) { + JSONEmitter(RK).run(OS); +} diff --git a/llvm/lib/TableGen/Main.cpp b/llvm/lib/TableGen/Main.cpp index 841fa7c3f3690..a4c41223c0762 100644 --- a/llvm/lib/TableGen/Main.cpp +++ b/llvm/lib/TableGen/Main.cpp @@ -131,13 +131,10 @@ int llvm::TableGenMain(const char *argv0, std::string OutString; raw_string_ostream Out(OutString); unsigned status = 0; - TableGen::Emitter::FnT ActionFn = TableGen::Emitter::Action->getValue(); - if (ActionFn) - ActionFn(Records, Out); - else if (MainFn) - status = MainFn(Out, Records); - else - return 1; + // ApplyCallback will return true if it did not apply any callback. In that + // case, attempt to apply the MainFn. + if (TableGen::Emitter::ApplyCallback(Records, Out)) + status = MainFn ? 
MainFn(Out, Records) : 1; Records.stopBackendTimer(); if (status) return 1; diff --git a/llvm/lib/TableGen/TableGenBackend.cpp b/llvm/lib/TableGen/TableGenBackend.cpp index 035abe936e114..210fff6545862 100644 --- a/llvm/lib/TableGen/TableGenBackend.cpp +++ b/llvm/lib/TableGen/TableGenBackend.cpp @@ -12,6 +12,8 @@ #include "llvm/TableGen/TableGenBackend.h" #include "llvm/ADT/Twine.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" #include @@ -19,15 +21,53 @@ #include using namespace llvm; +using namespace TableGen::Emitter; const size_t MAX_LINE_LEN = 80U; -namespace llvm::TableGen::Emitter { -ManagedStatic, OptCreatorT> Action; -void *OptCreatorT::call() { - return new cl::opt(cl::desc("Action to perform:")); +// CommandLine options of class type are not directly supported with some +// specific exceptions like std::string which are safe to copy. In our case, +// the `FnT` function_ref object is also safe to copy. So provide a +// specialization of `OptionValue` for `FnT` type that stores it as a copy. +// This is essentially similar to OptionValue specialization for +// strings. +template <> struct cl::OptionValue final : cl::OptionValueCopy { + OptionValue() = default; + + OptionValue(const FnT &V) { this->setValue(V); } + + OptionValue &operator=(const FnT &V) { + setValue(V); + return *this; + } +}; + +namespace { +struct OptCreatorT { + static void *call() { + return new cl::opt(cl::desc("Action to perform:")); + } +}; +} // namespace + +static ManagedStatic, OptCreatorT> CallbackFunction; + +Opt::Opt(StringRef Name, FnT CB, StringRef Desc, bool ByDefault) { + if (ByDefault) + CallbackFunction->setInitialValue(CB); + CallbackFunction->getParser().addLiteralOption(Name, CB, Desc); +} + +/// Apply callback specified on the command line. Returns true if no callback +/// was applied. +bool llvm::TableGen::Emitter::ApplyCallback(RecordKeeper &Records, + raw_ostream &OS) { + FnT Fn = CallbackFunction->getValue(); + if (!Fn) + return true; + Fn(Records, OS); + return false; } -} // namespace llvm::TableGen::Emitter static void printLine(raw_ostream &OS, const Twine &Prefix, char Fill, StringRef Suffix) { @@ -59,7 +99,7 @@ void llvm::emitSourceFileHeader(StringRef Desc, raw_ostream &OS, printLine(OS, Prefix + "Automatically generated file, do not edit!", ' ', Suffix); - // Print the filename of source file + // Print the filename of source file. if (!Record.getInputFilename().empty()) printLine( OS, Prefix + "From: " + sys::path::filename(Record.getInputFilename()), diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 5ac5b7f8a5ab1..215f30128e703 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1496,7 +1496,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::AVGCEILU, VT, Custom); if (!Subtarget->isLittleEndian()) - setOperationAction(ISD::BITCAST, VT, Expand); + setOperationAction(ISD::BITCAST, VT, Custom); if (Subtarget->hasSVE2() || (Subtarget->hasSME() && Subtarget->isStreaming())) @@ -1510,9 +1510,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); } - // Legalize unpacked bitcasts to REINTERPRET_CAST. 
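The TableGenBackend.cpp hunk above replaces direct access to the exported option with a registration/apply contract: each Opt object registers a named callback (optionally as the default), and ApplyCallback returns true when nothing was selected so that TableGenMain can fall back to MainFn. A stripped-down model of that contract, not LLVM's implementation; std::function and std::map stand in for cl::opt and ManagedStatic:

#include <functional>
#include <iostream>
#include <map>
#include <string>

using FnT = std::function<void(std::ostream &)>;

static std::map<std::string, FnT> &registry() {
  static std::map<std::string, FnT> R; // stands in for the cl::opt parser
  return R;
}
static FnT Selected; // what the command line (or a default) picked

struct Opt {
  Opt(const std::string &Name, FnT CB, bool ByDefault = false) {
    registry()[Name] = CB;
    if (ByDefault)
      Selected = CB;
  }
};

// Returns true if no callback was applied, mirroring the contract the new
// TableGenMain code relies on.
static bool ApplyCallback(std::ostream &OS) {
  if (!Selected)
    return true;
  Selected(OS);
  return false;
}

static Opt PrintRecords("print-records",
                        [](std::ostream &OS) { OS << "records...\n"; },
                        /*ByDefault=*/true);

int main() {
  if (ApplyCallback(std::cout))
    std::cerr << "no backend action selected\n";
  return 0;
}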
- for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16, - MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32}) + // Type legalize unpacked bitcasts. + for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32}) setOperationAction(ISD::BITCAST, VT, Custom); for (auto VT : @@ -1587,6 +1586,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, MVT::nxv4f32, MVT::nxv2f64}) { + setOperationAction(ISD::BITCAST, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); @@ -1658,20 +1658,15 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setCondCodeAction(ISD::SETUGT, VT, Expand); setCondCodeAction(ISD::SETUEQ, VT, Expand); setCondCodeAction(ISD::SETONE, VT, Expand); - - if (!Subtarget->isLittleEndian()) - setOperationAction(ISD::BITCAST, VT, Expand); } for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) { + setOperationAction(ISD::BITCAST, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); - - if (!Subtarget->isLittleEndian()) - setOperationAction(ISD::BITCAST, VT, Expand); } setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom); @@ -4962,22 +4957,35 @@ SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op, return LowerFixedLengthBitcastToSVE(Op, DAG); if (OpVT.isScalableVector()) { - // Bitcasting between unpacked vector types of different element counts is - // not a NOP because the live elements are laid out differently. - // 01234567 - // e.g. nxv2i32 = XX??XX?? - // nxv4f16 = X?X?X?X? - if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount()) - return SDValue(); + assert(isTypeLegal(OpVT) && "Unexpected result type!"); - if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) { + // Handle type legalisation first. + if (!isTypeLegal(ArgVT)) { assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() && "Expected int->fp bitcast!"); + + // Bitcasting between unpacked vector types of different element counts is + // not a NOP because the live elements are laid out differently. + // 01234567 + // e.g. nxv2i32 = XX??XX?? + // nxv4f16 = X?X?X?X? + if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount()) + return SDValue(); + SDValue ExtResult = DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT), Op.getOperand(0)); return getSVESafeBitCast(OpVT, ExtResult, DAG); } + + // Bitcasts between legal types with the same element count are legal. + if (OpVT.getVectorElementCount() == ArgVT.getVectorElementCount()) + return Op; + + // getSVESafeBitCast does not support casting between unpacked types. 
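The reason an unpacked-to-unpacked bitcast with a different element count cannot be a register NOP is the lane layout sketched in the comment (nxv2i32 = XX??XX??, nxv4f16 = X?X?X?X?). The tiny program below, my own illustration rather than LLVM code, just recomputes those occupancy pictures from the element size and the container size:

#include <cstdio>

// Which 16-bit slots of a 128-bit SVE granule hold live data when ElemBits
// of payload sit at the bottom of each ContainerBits-wide lane.
static void printLayout(const char *Name, int ElemBits, int ContainerBits) {
  char Slots[9] = "????????"; // 8 x 16-bit slots = 128 bits
  for (int Lane = 0; Lane * ContainerBits < 128; ++Lane)
    for (int S = 0; S < ElemBits / 16; ++S)
      Slots[Lane * ContainerBits / 16 + S] = 'X';
  std::printf("%-8s = %s\n", Name, Slots);
}

int main() {
  printLayout("nxv2i32", 32, 64); // XX??XX??
  printLayout("nxv4f16", 16, 32); // X?X?X?X?
}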
+ if (!isPackedVectorType(OpVT, DAG)) + return SDValue(); + return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG); } @@ -28906,7 +28914,22 @@ SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op, if (InVT != PackedInVT) Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op); - Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op); + if (Subtarget->isLittleEndian() || + PackedVT.getScalarSizeInBits() == PackedInVT.getScalarSizeInBits()) + Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op); + else { + EVT PackedVTAsInt = PackedVT.changeTypeToInteger(); + EVT PackedInVTAsInt = PackedInVT.changeTypeToInteger(); + + // Simulate the effect of casting through memory. + Op = DAG.getNode(ISD::BITCAST, DL, PackedInVTAsInt, Op); + if (PackedInVTAsInt.getScalarSizeInBits() != 8) + Op = DAG.getNode(ISD::BSWAP, DL, PackedInVTAsInt, Op); + Op = DAG.getNode(AArch64ISD::NVCAST, DL, PackedVTAsInt, Op); + if (PackedVTAsInt.getScalarSizeInBits() != 8) + Op = DAG.getNode(ISD::BSWAP, DL, PackedVTAsInt, Op); + Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op); + } // Unpack result if required. if (VT != PackedVT) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 2d41aff605a54..9fab886743241 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -4851,7 +4851,9 @@ void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB, void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, - MCRegister SrcReg, bool KillSrc) const { + MCRegister SrcReg, bool KillSrc, + bool RenamableDest, + bool RenamableSrc) const { if (AArch64::GPR32spRegClass.contains(DestReg) && (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { const TargetRegisterInfo *TRI = &getRegisterInfo(); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 1580cdc0a2ba2..dbd820afb3be0 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -344,7 +344,8 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo { llvm::ArrayRef Indices) const; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 3f4651ea9c2b6..694b7fb2068a2 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2650,113 +2650,62 @@ let Predicates = [HasSVEorSME] in { sub_32)>; } - // FIXME: BigEndian requires an additional REV instruction to satisfy the - // constraint that none of the bits change when stored to memory as one - // type, and reloaded as another type. 
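The getSVESafeBitCast change makes the big-endian case honor bitcast's defining property, namely that the result must match storing the value as the source type and reloading it as the destination type, which is exactly the constraint the removed .td FIXME described; the BSWAP/NVCAST/BSWAP sequence emulates that memory round trip in registers. A tiny host-side reminder of the property itself (fixed-width arrays stand in for scalable vectors):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  uint32_t Src[2] = {0x11223344, 0x55667788};
  uint16_t Dst[4];
  static_assert(sizeof(Src) == sizeof(Dst), "bitcast preserves total width");
  std::memcpy(Dst, Src, sizeof(Dst)); // "store as 2 x i32, reload as 4 x i16"
  for (uint16_t V : Dst)
    std::printf("0x%04x ", V); // on a little-endian host: 3344 1122 7788 5566
  std::printf("\n");
}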
- let Predicates = [IsLE] in { - def : Pat<(nxv16i8 (bitconvert nxv8i16:$src)), (nxv16i8 ZPR:$src)>; - def : Pat<(nxv16i8 (bitconvert nxv4i32:$src)), (nxv16i8 ZPR:$src)>; - def : Pat<(nxv16i8 (bitconvert nxv2i64:$src)), (nxv16i8 ZPR:$src)>; - def : Pat<(nxv16i8 (bitconvert nxv8f16:$src)), (nxv16i8 ZPR:$src)>; - def : Pat<(nxv16i8 (bitconvert nxv4f32:$src)), (nxv16i8 ZPR:$src)>; - def : Pat<(nxv16i8 (bitconvert nxv2f64:$src)), (nxv16i8 ZPR:$src)>; - - def : Pat<(nxv8i16 (bitconvert nxv16i8:$src)), (nxv8i16 ZPR:$src)>; - def : Pat<(nxv8i16 (bitconvert nxv4i32:$src)), (nxv8i16 ZPR:$src)>; - def : Pat<(nxv8i16 (bitconvert nxv2i64:$src)), (nxv8i16 ZPR:$src)>; - def : Pat<(nxv8i16 (bitconvert nxv8f16:$src)), (nxv8i16 ZPR:$src)>; - def : Pat<(nxv8i16 (bitconvert nxv4f32:$src)), (nxv8i16 ZPR:$src)>; - def : Pat<(nxv8i16 (bitconvert nxv2f64:$src)), (nxv8i16 ZPR:$src)>; - - def : Pat<(nxv4i32 (bitconvert nxv16i8:$src)), (nxv4i32 ZPR:$src)>; - def : Pat<(nxv4i32 (bitconvert nxv8i16:$src)), (nxv4i32 ZPR:$src)>; - def : Pat<(nxv4i32 (bitconvert nxv2i64:$src)), (nxv4i32 ZPR:$src)>; - def : Pat<(nxv4i32 (bitconvert nxv8f16:$src)), (nxv4i32 ZPR:$src)>; - def : Pat<(nxv4i32 (bitconvert nxv4f32:$src)), (nxv4i32 ZPR:$src)>; - def : Pat<(nxv4i32 (bitconvert nxv2f64:$src)), (nxv4i32 ZPR:$src)>; - - def : Pat<(nxv2i64 (bitconvert nxv16i8:$src)), (nxv2i64 ZPR:$src)>; - def : Pat<(nxv2i64 (bitconvert nxv8i16:$src)), (nxv2i64 ZPR:$src)>; - def : Pat<(nxv2i64 (bitconvert nxv4i32:$src)), (nxv2i64 ZPR:$src)>; - def : Pat<(nxv2i64 (bitconvert nxv8f16:$src)), (nxv2i64 ZPR:$src)>; - def : Pat<(nxv2i64 (bitconvert nxv4f32:$src)), (nxv2i64 ZPR:$src)>; - def : Pat<(nxv2i64 (bitconvert nxv2f64:$src)), (nxv2i64 ZPR:$src)>; - - def : Pat<(nxv8f16 (bitconvert nxv16i8:$src)), (nxv8f16 ZPR:$src)>; - def : Pat<(nxv8f16 (bitconvert nxv8i16:$src)), (nxv8f16 ZPR:$src)>; - def : Pat<(nxv8f16 (bitconvert nxv4i32:$src)), (nxv8f16 ZPR:$src)>; - def : Pat<(nxv8f16 (bitconvert nxv2i64:$src)), (nxv8f16 ZPR:$src)>; - def : Pat<(nxv8f16 (bitconvert nxv4f32:$src)), (nxv8f16 ZPR:$src)>; - def : Pat<(nxv8f16 (bitconvert nxv2f64:$src)), (nxv8f16 ZPR:$src)>; - - def : Pat<(nxv4f32 (bitconvert nxv16i8:$src)), (nxv4f32 ZPR:$src)>; - def : Pat<(nxv4f32 (bitconvert nxv8i16:$src)), (nxv4f32 ZPR:$src)>; - def : Pat<(nxv4f32 (bitconvert nxv4i32:$src)), (nxv4f32 ZPR:$src)>; - def : Pat<(nxv4f32 (bitconvert nxv2i64:$src)), (nxv4f32 ZPR:$src)>; - def : Pat<(nxv4f32 (bitconvert nxv8f16:$src)), (nxv4f32 ZPR:$src)>; - def : Pat<(nxv4f32 (bitconvert nxv2f64:$src)), (nxv4f32 ZPR:$src)>; - - def : Pat<(nxv2f64 (bitconvert nxv16i8:$src)), (nxv2f64 ZPR:$src)>; - def : Pat<(nxv2f64 (bitconvert nxv8i16:$src)), (nxv2f64 ZPR:$src)>; - def : Pat<(nxv2f64 (bitconvert nxv4i32:$src)), (nxv2f64 ZPR:$src)>; - def : Pat<(nxv2f64 (bitconvert nxv2i64:$src)), (nxv2f64 ZPR:$src)>; - def : Pat<(nxv2f64 (bitconvert nxv8f16:$src)), (nxv2f64 ZPR:$src)>; - def : Pat<(nxv2f64 (bitconvert nxv4f32:$src)), (nxv2f64 ZPR:$src)>; - - def : Pat<(nxv8bf16 (bitconvert nxv16i8:$src)), (nxv8bf16 ZPR:$src)>; - def : Pat<(nxv8bf16 (bitconvert nxv8i16:$src)), (nxv8bf16 ZPR:$src)>; - def : Pat<(nxv8bf16 (bitconvert nxv4i32:$src)), (nxv8bf16 ZPR:$src)>; - def : Pat<(nxv8bf16 (bitconvert nxv2i64:$src)), (nxv8bf16 ZPR:$src)>; - def : Pat<(nxv8bf16 (bitconvert nxv8f16:$src)), (nxv8bf16 ZPR:$src)>; - def : Pat<(nxv8bf16 (bitconvert nxv4f32:$src)), (nxv8bf16 ZPR:$src)>; - def : Pat<(nxv8bf16 (bitconvert nxv2f64:$src)), (nxv8bf16 ZPR:$src)>; - - def : Pat<(nxv16i8 (bitconvert nxv8bf16:$src)), (nxv16i8 
ZPR:$src)>; - def : Pat<(nxv8i16 (bitconvert nxv8bf16:$src)), (nxv8i16 ZPR:$src)>; - def : Pat<(nxv4i32 (bitconvert nxv8bf16:$src)), (nxv4i32 ZPR:$src)>; - def : Pat<(nxv2i64 (bitconvert nxv8bf16:$src)), (nxv2i64 ZPR:$src)>; - def : Pat<(nxv8f16 (bitconvert nxv8bf16:$src)), (nxv8f16 ZPR:$src)>; - def : Pat<(nxv4f32 (bitconvert nxv8bf16:$src)), (nxv4f32 ZPR:$src)>; - def : Pat<(nxv2f64 (bitconvert nxv8bf16:$src)), (nxv2f64 ZPR:$src)>; - - def : Pat<(nxv16i1 (bitconvert aarch64svcount:$src)), (nxv16i1 PPR:$src)>; - def : Pat<(aarch64svcount (bitconvert nxv16i1:$src)), (aarch64svcount PNR:$src)>; - } - - // These allow casting from/to unpacked predicate types. - def : Pat<(nxv16i1 (reinterpret_cast nxv16i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>; - def : Pat<(nxv16i1 (reinterpret_cast nxv8i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>; - def : Pat<(nxv16i1 (reinterpret_cast nxv4i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>; - def : Pat<(nxv16i1 (reinterpret_cast nxv2i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>; - def : Pat<(nxv16i1 (reinterpret_cast nxv1i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>; - def : Pat<(nxv8i1 (reinterpret_cast nxv16i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>; - def : Pat<(nxv8i1 (reinterpret_cast nxv4i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>; - def : Pat<(nxv8i1 (reinterpret_cast nxv2i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>; - def : Pat<(nxv8i1 (reinterpret_cast nxv1i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>; - def : Pat<(nxv4i1 (reinterpret_cast nxv16i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>; - def : Pat<(nxv4i1 (reinterpret_cast nxv8i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>; - def : Pat<(nxv4i1 (reinterpret_cast nxv2i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>; - def : Pat<(nxv4i1 (reinterpret_cast nxv1i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>; - def : Pat<(nxv2i1 (reinterpret_cast nxv16i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>; - def : Pat<(nxv2i1 (reinterpret_cast nxv8i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>; - def : Pat<(nxv2i1 (reinterpret_cast nxv4i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>; - def : Pat<(nxv2i1 (reinterpret_cast nxv1i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>; - def : Pat<(nxv1i1 (reinterpret_cast nxv16i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>; - def : Pat<(nxv1i1 (reinterpret_cast nxv8i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>; - def : Pat<(nxv1i1 (reinterpret_cast nxv4i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>; - def : Pat<(nxv1i1 (reinterpret_cast nxv2i1:$src)), (COPY_TO_REGCLASS PPR:$src, PPR)>; - - // These allow casting from/to unpacked floating-point types. 
- def : Pat<(nxv2f16 (reinterpret_cast nxv8f16:$src)), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; - def : Pat<(nxv8f16 (reinterpret_cast nxv2f16:$src)), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; - def : Pat<(nxv4f16 (reinterpret_cast nxv8f16:$src)), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; - def : Pat<(nxv8f16 (reinterpret_cast nxv4f16:$src)), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; - def : Pat<(nxv2f32 (reinterpret_cast nxv4f32:$src)), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; - def : Pat<(nxv4f32 (reinterpret_cast nxv2f32:$src)), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; - def : Pat<(nxv2bf16 (reinterpret_cast nxv8bf16:$src)), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; - def : Pat<(nxv8bf16 (reinterpret_cast nxv2bf16:$src)), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; - def : Pat<(nxv4bf16 (reinterpret_cast nxv8bf16:$src)), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; - def : Pat<(nxv8bf16 (reinterpret_cast nxv4bf16:$src)), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + // For big endian, only BITCASTs involving same sized vector types with same + // size vector elements can be isel'd directly. + let Predicates = [IsLE] in + foreach VT = [ nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv8f16, nxv4f32, nxv2f64, nxv8bf16 ] in + foreach VT2 = [ nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv8f16, nxv4f32, nxv2f64, nxv8bf16 ] in + if !ne(VT,VT2) then + def : Pat<(VT (bitconvert (VT2 ZPR:$src))), (VT ZPR:$src)>; + + def : Pat<(nxv8i16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8f16 ZPR:$src)>; + + def : Pat<(nxv4i32 (bitconvert (nxv4f32 ZPR:$src))), (nxv4i32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv4i32 ZPR:$src))), (nxv4f32 ZPR:$src)>; + + def : Pat<(nxv2i64 (bitconvert (nxv2f64 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv2i64 ZPR:$src))), (nxv2f64 ZPR:$src)>; + + def : Pat<(nxv8i16 (bitconvert (nxv8bf16 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv8bf16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + + def : Pat<(nxv8bf16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv8bf16 ZPR:$src))), (nxv8f16 ZPR:$src)>; + + def : Pat<(nxv4bf16 (bitconvert (nxv4f16 ZPR:$src))), (nxv4bf16 ZPR:$src)>; + def : Pat<(nxv4f16 (bitconvert (nxv4bf16 ZPR:$src))), (nxv4f16 ZPR:$src)>; + + def : Pat<(nxv2bf16 (bitconvert (nxv2f16 ZPR:$src))), (nxv2bf16 ZPR:$src)>; + def : Pat<(nxv2f16 (bitconvert (nxv2bf16 ZPR:$src))), (nxv2f16 ZPR:$src)>; + + def : Pat<(nxv16i1 (bitconvert (aarch64svcount PNR:$src))), (nxv16i1 PPR:$src)>; + def : Pat<(aarch64svcount (bitconvert (nxv16i1 PPR:$src))), (aarch64svcount PNR:$src)>; + + // These allow nop casting between predicate vector types. + foreach VT = [ nxv16i1, nxv8i1, nxv4i1, nxv2i1, nxv1i1 ] in + foreach VT2 = [ nxv16i1, nxv8i1, nxv4i1, nxv2i1, nxv1i1 ] in + def : Pat<(VT (reinterpret_cast (VT2 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + + // These allow nop casting between half vector types. + foreach VT = [ nxv2f16, nxv4f16, nxv8f16 ] in + foreach VT2 = [ nxv2f16, nxv4f16, nxv8f16 ] in + def : Pat<(VT (reinterpret_cast (VT2 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + + // These allow nop casting between float vector types. + foreach VT = [ nxv2f32, nxv4f32 ] in + foreach VT2 = [ nxv2f32, nxv4f32 ] in + def : Pat<(VT (reinterpret_cast (VT2 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + + // These allow nop casting between bfloat vector types. 
+ foreach VT = [ nxv2bf16, nxv4bf16, nxv8bf16 ] in + foreach VT2 = [ nxv2bf16, nxv4bf16, nxv8bf16 ] in + def : Pat<(VT (reinterpret_cast (VT2 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + + // These allow nop casting between all packed vector types. + foreach VT = [ nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv8f16, nxv4f32, nxv2f64, nxv8bf16 ] in + foreach VT2 = [ nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv8f16, nxv4f32, nxv2f64, nxv8bf16 ] in + def : Pat<(VT (AArch64NvCast (VT2 ZPR:$src))), (VT ZPR:$src)>; def : Pat<(nxv16i1 (and PPR:$Ps1, PPR:$Ps2)), (AND_PPzPP (PTRUE_B 31), PPR:$Ps1, PPR:$Ps2)>; diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 1a10206eea237..3914f36338fa5 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -3815,7 +3815,7 @@ bool AMDGPUAsmParser::validateVOPDRegBankConstraints( const MCOperand &Opr = Inst.getOperand(OperandIdx); return (Opr.isReg() && !isSGPR(mc2PseudoReg(Opr.getReg()), TRI)) ? Opr.getReg() - : MCRegister::NoRegister; + : MCRegister(); }; // On GFX12 if both OpX and OpY are V_MOV_B32 then OPY uses SRC2 source-cache. @@ -4753,7 +4753,7 @@ static int IsAGPROperand(const MCInst &Inst, uint16_t NameIdx, if (!Op.isReg()) return -1; - unsigned Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0); + MCRegister Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0); auto Reg = Sub ? Sub : Op.getReg(); const MCRegisterClass &AGPR32 = MRI->getRegClass(AMDGPU::AGPR_32RegClassID); return AGPR32.contains(Reg) ? 1 : 0; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 1a0dc7098347a..b1da9da19c69b 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -382,7 +382,7 @@ static bool IsAGPROperand(const MCInst &Inst, int OpIdx, if (!Op.isReg()) return false; - unsigned Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0); + MCRegister Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0); auto Reg = Sub ? Sub : Op.getReg(); return Reg >= AMDGPU::AGPR0 && Reg <= AMDGPU::AGPR255; } diff --git a/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp index de3c06f3a71e2..0fa8d4847931a 100644 --- a/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp +++ b/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp @@ -180,11 +180,8 @@ class R600EmitClauseMarkers : public MachineFunctionPass { MachineBasicBlock::iterator BBEnd) { const R600RegisterInfo &TRI = TII->getRegisterInfo(); //TODO: change this to defs? - for (MachineInstr::const_mop_iterator - MOI = Def->operands_begin(), - MOE = Def->operands_end(); MOI != MOE; ++MOI) { - if (!MOI->isReg() || !MOI->isDef() || - TRI.isPhysRegLiveAcrossClauses(MOI->getReg())) + for (MachineOperand &MO : Def->all_defs()) { + if (TRI.isPhysRegLiveAcrossClauses(MO.getReg())) continue; // Def defines a clause local register, so check that its use will fit @@ -208,11 +205,11 @@ class R600EmitClauseMarkers : public MachineFunctionPass { // occur in the same basic block as its definition, because // it is illegal for the scheduler to schedule them in // different blocks. 
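The nested foreach loops in AArch64SVEInstrInfo.td replace the long hand-written pattern lists with a cross product over the type list, with the !ne(VT, VT2) guard filtering out the identity pairs for the bitconvert patterns. A C++ stand-in for the expansion the generator performs (the type names come from the list above):

#include <cstdio>
#include <cstring>

int main() {
  const char *Types[] = {"nxv16i8", "nxv8i16", "nxv4i32", "nxv2i64",
                         "nxv8f16", "nxv4f32", "nxv2f64", "nxv8bf16"};
  int N = 0;
  for (const char *VT : Types)
    for (const char *VT2 : Types)
      if (std::strcmp(VT, VT2) != 0) { // the !ne(VT, VT2) guard
        std::printf("Pat<(%s (bitconvert (%s ZPR:$src))), (%s ZPR:$src)>\n",
                    VT, VT2, VT);
        ++N;
      }
  std::printf("%d patterns\n", N); // 8 * 8 - 8 = 56
}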
- if (UseI->readsRegister(MOI->getReg(), &TRI)) + if (UseI->readsRegister(MO.getReg(), &TRI)) LastUseCount = AluInstCount; // Exit early if the current use kills the register - if (UseI != Def && UseI->killsRegister(MOI->getReg(), &TRI)) + if (UseI != Def && UseI->killsRegister(MO.getReg(), &TRI)) break; } if (LastUseCount) diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index ff44454339268..d826ae8c5da23 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -40,7 +40,8 @@ bool R600InstrInfo::isVector(const MachineInstr &MI) const { void R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, - MCRegister SrcReg, bool KillSrc) const { + MCRegister SrcReg, bool KillSrc, + bool RenamableDest, bool RenamableSrc) const { unsigned VectorComponents = 0; if ((R600::R600_Reg128RegClass.contains(DestReg) || R600::R600_Reg128VerticalRegClass.contains(DestReg)) && diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/llvm/lib/Target/AMDGPU/R600InstrInfo.h index f720e4656348c..c767ecb24590b 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.h +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.h @@ -73,7 +73,8 @@ class R600InstrInfo final : public R600GenInstrInfo { void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; bool isLegalToSplitMBBAt(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const override; diff --git a/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp b/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp index eded8063feaaa..a2ce8ee361040 100644 --- a/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp @@ -350,13 +350,9 @@ void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) { Register DestReg = MI->getOperand(DstIndex).getReg(); // PressureRegister crashes if an operand is def and used in the same inst // and we try to constraint its regclass - for (MachineInstr::mop_iterator It = MI->operands_begin(), - E = MI->operands_end(); It != E; ++It) { - MachineOperand &MO = *It; - if (MO.isReg() && !MO.isDef() && - MO.getReg() == DestReg) + for (const MachineOperand &MO : MI->all_uses()) + if (MO.getReg() == DestReg) return; - } // Constrains the regclass of DestReg to assign it to Slot switch (Slot) { case 0: diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index eafe20be17d5b..8ae7f2910ec5a 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -883,10 +883,8 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, // can be used as the actual source after export patching, so // we need to treat them like sources and set the EXP_CNT // score. 
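Several hunks here (R600EmitClauseMarkers, R600MachineScheduler, SIInsertWaitcnts) rewrite manual operand loops as range-for over all_defs()/all_uses(), which pre-filter to register operands that are defs or uses respectively, so the isReg()/isDef() checks disappear from the loop bodies. A toy version of the idea, eagerly materialised for brevity where the LLVM helpers return lazy ranges:

#include <cstdio>
#include <vector>

struct Operand {
  bool IsReg, IsDef;
  int Reg;
};

static std::vector<Operand> allDefs(const std::vector<Operand> &Ops) {
  std::vector<Operand> Out;
  for (const Operand &MO : Ops)
    if (MO.IsReg && MO.IsDef) // the filter the old loop bodies did by hand
      Out.push_back(MO);
  return Out;
}

int main() {
  std::vector<Operand> Ops = {{true, true, 1}, {true, false, 2},
                              {false, false, 0}};
  for (const Operand &MO : allDefs(Ops))
    std::printf("def of reg %d\n", MO.Reg); // only reg 1
}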
- for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { - MachineOperand &DefMO = Inst.getOperand(I); - if (DefMO.isReg() && DefMO.isDef() && - TRI->isVGPR(*MRI, DefMO.getReg())) { + for (MachineOperand &DefMO : Inst.all_defs()) { + if (TRI->isVGPR(*MRI, DefMO.getReg())) { setRegScore( TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)), EXP_CNT, CurrScore); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 84d25a1fbd272..a857bdba53c3e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -794,7 +794,8 @@ static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, - MCRegister SrcReg, bool KillSrc) const { + MCRegister SrcReg, bool KillSrc, + bool RenamableDest, bool RenamableSrc) const { const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg); unsigned Size = RI.getRegSizeInBits(*RC); const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index badfd91c0b972..4fd9b4366159b 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -255,7 +255,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; void materializeImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.cpp b/llvm/lib/Target/ARC/ARCInstrInfo.cpp index 9b5e45cb5fe97..78db68fca3050 100644 --- a/llvm/lib/Target/ARC/ARCInstrInfo.cpp +++ b/llvm/lib/Target/ARC/ARCInstrInfo.cpp @@ -281,7 +281,8 @@ unsigned ARCInstrInfo::removeBranch(MachineBasicBlock &MBB, void ARCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, - MCRegister SrcReg, bool KillSrc) const { + MCRegister SrcReg, bool KillSrc, + bool RenamableDest, bool RenamableSrc) const { assert(ARC::GPR32RegClass.contains(SrcReg) && "Only GPR32 src copy supported."); assert(ARC::GPR32RegClass.contains(DestReg) && diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.h b/llvm/lib/Target/ARC/ARCInstrInfo.h index 1875aafbde826..e25f990252260 100644 --- a/llvm/lib/Target/ARC/ARCInstrInfo.h +++ b/llvm/lib/Target/ARC/ARCInstrInfo.h @@ -65,7 +65,8 @@ class ARCInstrInfo : public ARCGenInstrInfo { void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 1199052ca97e9..49513fe10945a 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -892,7 +892,9 @@ void llvm::addPredicatedMveVpredROp(MachineInstrBuilder &MIB, void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, - MCRegister SrcReg, bool KillSrc) 
const { + MCRegister SrcReg, bool KillSrc, + bool RenamableDest, + bool RenamableSrc) const { bool GPRDest = ARM::GPRRegClass.contains(DestReg); bool GPRSrc = ARM::GPRRegClass.contains(SrcReg); diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index 8521e3ef91399..9e4e12a9a7441 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -209,7 +209,8 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, diff --git a/llvm/lib/Target/ARM/ARMCallingConv.cpp b/llvm/lib/Target/ARM/ARMCallingConv.cpp index 4878c73138940..2ab66da4b4d2d 100644 --- a/llvm/lib/Target/ARM/ARMCallingConv.cpp +++ b/llvm/lib/Target/ARM/ARMCallingConv.cpp @@ -24,7 +24,7 @@ static bool f64AssignAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, static const MCPhysReg RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; // Try to get the first register. - if (unsigned Reg = State.AllocateReg(RegList)) + if (MCRegister Reg = State.AllocateReg(RegList)) State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); else { // For the 2nd half of a v2f64, do not fail. @@ -38,7 +38,7 @@ static bool f64AssignAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, } // Try to get the second register. - if (unsigned Reg = State.AllocateReg(RegList)) + if (MCRegister Reg = State.AllocateReg(RegList)) State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); else State.addLoc(CCValAssign::getCustomMem( @@ -67,8 +67,8 @@ static bool f64AssignAAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, static const MCPhysReg ShadowRegList[] = { ARM::R0, ARM::R1 }; static const MCPhysReg GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; - unsigned Reg = State.AllocateReg(HiRegList, ShadowRegList); - if (Reg == 0) { + MCRegister Reg = State.AllocateReg(HiRegList, ShadowRegList); + if (!Reg) { // If we had R3 unallocated only, now we still must to waste it. 
Reg = State.AllocateReg(GPRArgRegs); @@ -89,7 +89,7 @@ static bool f64AssignAAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, if (HiRegList[i] == Reg) break; - unsigned T = State.AllocateReg(LoRegList[i]); + MCRegister T = State.AllocateReg(LoRegList[i]); (void)T; assert(T == LoRegList[i] && "Could not allocate register"); @@ -116,8 +116,8 @@ static bool f64RetAssign(unsigned ValNo, MVT ValVT, MVT LocVT, static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 }; static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 }; - unsigned Reg = State.AllocateReg(HiRegList, LoRegList); - if (Reg == 0) + MCRegister Reg = State.AllocateReg(HiRegList, LoRegList); + if (!Reg) return false; // we didn't handle it unsigned i; @@ -287,7 +287,7 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned ValNo, MVT ValVT, static bool CustomAssignInRegList(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, CCState &State, ArrayRef RegList) { - unsigned Reg = State.AllocateReg(RegList); + MCRegister Reg = State.AllocateReg(RegList); if (Reg) { State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return true; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 4ab0433069ae6..853f54943eebf 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -2915,7 +2915,7 @@ void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size, // Byval (as with any stack) slots are always at least 4 byte aligned. Alignment = std::max(Alignment, Align(4)); - unsigned Reg = State->AllocateReg(GPRArgRegs); + MCRegister Reg = State->AllocateReg(GPRArgRegs); if (!Reg) return; diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp index 396328e958d18..a38aa3de40d90 100644 --- a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -42,7 +42,8 @@ unsigned Thumb1InstrInfo::getUnindexedOpcode(unsigned Opc) const { void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, - MCRegister SrcReg, bool KillSrc) const { + MCRegister SrcReg, bool KillSrc, + bool RenamableDest, bool RenamableSrc) const { // Need to check the arch. 
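In the ARMCallingConv.cpp hunks the allocation results change type from unsigned to MCRegister and the Reg == 0 comparisons become !Reg; this works because register number 0 encodes "no register" and the handle converts to bool accordingly. A hypothetical handle type, just to show the pattern, not the real MCRegister:

#include <cstdio>

class RegHandle { // illustrative stand-in for MCRegister
  unsigned Id = 0; // 0 encodes "no register"
public:
  RegHandle() = default;
  explicit RegHandle(unsigned Id) : Id(Id) {}
  explicit operator bool() const { return Id != 0; }
  unsigned id() const { return Id; }
};

static RegHandle allocateReg(bool Available) {
  return Available ? RegHandle(3) : RegHandle();
}

int main() {
  if (RegHandle Reg = allocateReg(false); !Reg)
    std::printf("no register left; assign a stack location instead\n");
}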
MachineFunction &MF = *MBB.getParent(); const ARMSubtarget &st = MF.getSubtarget(); diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/llvm/lib/Target/ARM/Thumb1InstrInfo.h index 984bec4e64490..84241fb8a9a66 100644 --- a/llvm/lib/Target/ARM/Thumb1InstrInfo.h +++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.h @@ -39,7 +39,8 @@ class Thumb1InstrInfo : public ARMBaseInstrInfo { void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index 09bcd3109f2b3..d1e07b6703a5e 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -151,7 +151,8 @@ Thumb2InstrInfo::optimizeSelect(MachineInstr &MI, void Thumb2InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, - MCRegister SrcReg, bool KillSrc) const { + MCRegister SrcReg, bool KillSrc, + bool RenamableDest, bool RenamableSrc) const { // Handle SPR, DPR, and QPR copies. if (!ARM::GPRRegClass.contains(DestReg, SrcReg)) return ARMBaseInstrInfo::copyPhysReg(MBB, I, DL, DestReg, SrcReg, KillSrc); diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h index 8915da8c5bf3c..70ee3270e64ac 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h @@ -39,7 +39,8 @@ class Thumb2InstrInfo : public ARMBaseInstrInfo { void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/llvm/lib/Target/AVR/AVRInstrInfo.cpp index 18b7365fc5aa0..7b0f8d74e77c2 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.cpp +++ b/llvm/lib/Target/AVR/AVRInstrInfo.cpp @@ -42,7 +42,8 @@ AVRInstrInfo::AVRInstrInfo(AVRSubtarget &STI) void AVRInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, - MCRegister SrcReg, bool KillSrc) const { + MCRegister SrcReg, bool KillSrc, + bool RenamableDest, bool RenamableSrc) const { const AVRRegisterInfo &TRI = *STI.getRegisterInfo(); unsigned Opc; diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.h b/llvm/lib/Target/AVR/AVRInstrInfo.h index 28c0e0319d46e..8eb4292f2422d 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.h +++ b/llvm/lib/Target/AVR/AVRInstrInfo.h @@ -75,7 +75,8 @@ class AVRInstrInfo : public AVRGenInstrInfo { void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.cpp b/llvm/lib/Target/BPF/BPFInstrInfo.cpp index 2209f1f1462b4..1b07e7ffc0d31 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.cpp +++ 
b/llvm/lib/Target/BPF/BPFInstrInfo.cpp @@ -31,7 +31,8 @@ BPFInstrInfo::BPFInstrInfo() void BPFInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, - MCRegister SrcReg, bool KillSrc) const { + MCRegister SrcReg, bool KillSrc, + bool RenamableDest, bool RenamableSrc) const { if (BPF::GPRRegClass.contains(DestReg, SrcReg)) BuildMI(MBB, I, DL, get(BPF::MOV_rr), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.h b/llvm/lib/Target/BPF/BPFInstrInfo.h index 354aca1bd2f93..a6b6fd7dc4d96 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.h +++ b/llvm/lib/Target/BPF/BPFInstrInfo.h @@ -31,7 +31,8 @@ class BPFInstrInfo : public BPFGenInstrInfo { void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp index 6baca84ab3d0a..a2bb87bcaaf94 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp @@ -478,7 +478,8 @@ void CSKYInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, void CSKYInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, - MCRegister SrcReg, bool KillSrc) const { + MCRegister SrcReg, bool KillSrc, + bool RenamableDest, bool RenamableSrc) const { if (CSKY::GPRRegClass.contains(SrcReg) && CSKY::CARRYRegClass.contains(DestReg)) { if (STI.hasE2()) { diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.h b/llvm/lib/Target/CSKY/CSKYInstrInfo.h index 4e3866b1188ca..54c1106310d85 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.h +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.h @@ -55,7 +55,8 @@ class CSKYInstrInfo : public CSKYGenInstrInfo { void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef Cond, diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp index e49169cff8aa8..2daa4f825c3b2 100644 --- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp +++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp @@ -38,6 +38,7 @@ static bool isIntrinsicExpansion(Function &F) { case Intrinsic::log: case Intrinsic::log10: case Intrinsic::pow: + case Intrinsic::dx_all: case Intrinsic::dx_any: case Intrinsic::dx_clamp: case Intrinsic::dx_uclamp: @@ -54,8 +55,7 @@ static bool isIntrinsicExpansion(Function &F) { static Value *expandAbs(CallInst *Orig) { Value *X = Orig->getOperand(0); - IRBuilder<> Builder(Orig->getParent()); - Builder.SetInsertPoint(Orig); + IRBuilder<> Builder(Orig); Type *Ty = X->getType(); Type *EltTy = Ty->getScalarType(); Constant *Zero = Ty->isVectorTy() @@ -148,8 +148,7 @@ static Value *expandIntegerDotIntrinsic(CallInst *Orig, static Value *expandExpIntrinsic(CallInst *Orig) { Value *X = Orig->getOperand(0); - IRBuilder<> Builder(Orig->getParent()); - Builder.SetInsertPoint(Orig); + IRBuilder<> Builder(Orig); Type *Ty = X->getType(); Type *EltTy = Ty->getScalarType(); Constant 
*Log2eConst = @@ -166,13 +165,21 @@ static Value *expandExpIntrinsic(CallInst *Orig) { return Exp2Call; } -static Value *expandAnyIntrinsic(CallInst *Orig) { +static Value *expandAnyOrAllIntrinsic(CallInst *Orig, + Intrinsic::ID intrinsicId) { Value *X = Orig->getOperand(0); - IRBuilder<> Builder(Orig->getParent()); - Builder.SetInsertPoint(Orig); + IRBuilder<> Builder(Orig); Type *Ty = X->getType(); Type *EltTy = Ty->getScalarType(); + auto ApplyOp = [&Builder](Intrinsic::ID IntrinsicId, Value *Result, + Value *Elt) { + if (IntrinsicId == Intrinsic::dx_any) + return Builder.CreateOr(Result, Elt); + assert(IntrinsicId == Intrinsic::dx_all); + return Builder.CreateAnd(Result, Elt); + }; + Value *Result = nullptr; if (!Ty->isVectorTy()) { Result = EltTy->isFloatingPointTy() @@ -193,7 +200,7 @@ static Value *expandAnyIntrinsic(CallInst *Orig) { Result = Builder.CreateExtractElement(Cond, (uint64_t)0); for (unsigned I = 1; I < XVec->getNumElements(); I++) { Value *Elt = Builder.CreateExtractElement(Cond, I); - Result = Builder.CreateOr(Result, Elt); + Result = ApplyOp(intrinsicId, Result, Elt); } } return Result; @@ -201,8 +208,7 @@ static Value *expandAnyIntrinsic(CallInst *Orig) { static Value *expandLengthIntrinsic(CallInst *Orig) { Value *X = Orig->getOperand(0); - IRBuilder<> Builder(Orig->getParent()); - Builder.SetInsertPoint(Orig); + IRBuilder<> Builder(Orig); Type *Ty = X->getType(); Type *EltTy = Ty->getScalarType(); @@ -230,8 +236,7 @@ static Value *expandLerpIntrinsic(CallInst *Orig) { Value *X = Orig->getOperand(0); Value *Y = Orig->getOperand(1); Value *S = Orig->getOperand(2); - IRBuilder<> Builder(Orig->getParent()); - Builder.SetInsertPoint(Orig); + IRBuilder<> Builder(Orig); auto *V = Builder.CreateFSub(Y, X); V = Builder.CreateFMul(S, V); return Builder.CreateFAdd(X, V, "dx.lerp"); @@ -240,8 +245,7 @@ static Value *expandLerpIntrinsic(CallInst *Orig) { static Value *expandLogIntrinsic(CallInst *Orig, float LogConstVal = numbers::ln2f) { Value *X = Orig->getOperand(0); - IRBuilder<> Builder(Orig->getParent()); - Builder.SetInsertPoint(Orig); + IRBuilder<> Builder(Orig); Type *Ty = X->getType(); Type *EltTy = Ty->getScalarType(); Constant *Ln2Const = @@ -266,8 +270,7 @@ static Value *expandNormalizeIntrinsic(CallInst *Orig) { Value *X = Orig->getOperand(0); Type *Ty = Orig->getType(); Type *EltTy = Ty->getScalarType(); - IRBuilder<> Builder(Orig->getParent()); - Builder.SetInsertPoint(Orig); + IRBuilder<> Builder(Orig); auto *XVec = dyn_cast(Ty); if (!XVec) { @@ -305,8 +308,7 @@ static Value *expandPowIntrinsic(CallInst *Orig) { Value *X = Orig->getOperand(0); Value *Y = Orig->getOperand(1); Type *Ty = X->getType(); - IRBuilder<> Builder(Orig->getParent()); - Builder.SetInsertPoint(Orig); + IRBuilder<> Builder(Orig); auto *Log2Call = Builder.CreateIntrinsic(Ty, Intrinsic::log2, {X}, nullptr, "elt.log2"); @@ -350,8 +352,7 @@ static Value *expandClampIntrinsic(CallInst *Orig, Value *Min = Orig->getOperand(1); Value *Max = Orig->getOperand(2); Type *Ty = X->getType(); - IRBuilder<> Builder(Orig->getParent()); - Builder.SetInsertPoint(Orig); + IRBuilder<> Builder(Orig); auto *MaxCall = Builder.CreateIntrinsic( Ty, getMaxForClamp(Ty, ClampIntrinsic), {X, Min}, nullptr, "dx.max"); return Builder.CreateIntrinsic(Ty, getMinForClamp(Ty, ClampIntrinsic), @@ -360,7 +361,8 @@ static Value *expandClampIntrinsic(CallInst *Orig, static bool expandIntrinsic(Function &F, CallInst *Orig) { Value *Result = nullptr; - switch (F.getIntrinsicID()) { + Intrinsic::ID IntrinsicId = F.getIntrinsicID(); 
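expandAnyOrAllIntrinsic folds the per-element conditions with OR for dx_any and AND for dx_all via the ApplyOp lambda, and scalar inputs skip the loop entirely. The same reduction in plain C++, as a stand-alone model rather than the IRBuilder code:

#include <cstdio>
#include <vector>

static bool reduce(const std::vector<bool> &Cond, bool IsAll) {
  bool Result = Cond.front();              // extractelement 0
  for (size_t I = 1; I < Cond.size(); ++I) // remaining lanes
    Result = IsAll ? (Result && Cond[I]) : (Result || Cond[I]);
  return Result;
}

int main() {
  std::vector<bool> C = {true, false, true, true};
  std::printf("any = %d, all = %d\n", reduce(C, /*IsAll=*/false),
              reduce(C, /*IsAll=*/true)); // any = 1, all = 0
}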
+ switch (IntrinsicId) { case Intrinsic::abs: Result = expandAbs(Orig); break; @@ -376,12 +378,13 @@ static bool expandIntrinsic(Function &F, CallInst *Orig) { case Intrinsic::pow: Result = expandPowIntrinsic(Orig); break; + case Intrinsic::dx_all: case Intrinsic::dx_any: - Result = expandAnyIntrinsic(Orig); + Result = expandAnyOrAllIntrinsic(Orig, IntrinsicId); break; case Intrinsic::dx_uclamp: case Intrinsic::dx_clamp: - Result = expandClampIntrinsic(Orig, F.getIntrinsicID()); + Result = expandClampIntrinsic(Orig, IntrinsicId); break; case Intrinsic::dx_lerp: Result = expandLerpIntrinsic(Orig); @@ -397,7 +400,7 @@ static bool expandIntrinsic(Function &F, CallInst *Orig) { break; case Intrinsic::dx_sdot: case Intrinsic::dx_udot: - Result = expandIntegerDotIntrinsic(Orig, F.getIntrinsicID()); + Result = expandIntegerDotIntrinsic(Orig, IntrinsicId); break; } diff --git a/llvm/lib/Target/DirectX/DXILMetadata.cpp b/llvm/lib/Target/DirectX/DXILMetadata.cpp index ed0434ac98a18..1f5759c363013 100644 --- a/llvm/lib/Target/DirectX/DXILMetadata.cpp +++ b/llvm/lib/Target/DirectX/DXILMetadata.cpp @@ -284,6 +284,11 @@ void dxil::createEntryMD(Module &M, const uint64_t ShaderFlags) { EntryList.emplace_back(&F); } + // If there are no entries, do nothing. This is mostly to allow for writing + // tests with no actual entry functions. + if (EntryList.empty()) + return; + auto &Ctx = M.getContext(); // FIXME: generate metadata for resource. // See https://github.com/llvm/llvm-project/issues/57926. diff --git a/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp b/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp index 7d2abb7078b8a..f282eff6c002b 100644 --- a/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp +++ b/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp @@ -10,21 +10,248 @@ #include "DXILResourceAnalysis.h" #include "DirectX.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/DXILResource.h" #include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/FormatAdapters.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; -static void prettyPrintResources(raw_ostream &OS, +static StringRef getRCName(dxil::ResourceClass RC) { + switch (RC) { + case dxil::ResourceClass::SRV: + return "SRV"; + case dxil::ResourceClass::UAV: + return "UAV"; + case dxil::ResourceClass::CBuffer: + return "cbuffer"; + case dxil::ResourceClass::Sampler: + return "sampler"; + } + llvm_unreachable("covered switch"); +} + +static StringRef getRCPrefix(dxil::ResourceClass RC) { + switch (RC) { + case dxil::ResourceClass::SRV: + return "t"; + case dxil::ResourceClass::UAV: + return "u"; + case dxil::ResourceClass::CBuffer: + return "cb"; + case dxil::ResourceClass::Sampler: + return "s"; + } +} + +static StringRef getFormatName(const dxil::ResourceInfo &RI) { + if (RI.isTyped()) { + switch (RI.getTyped().ElementTy) { + case dxil::ElementType::I1: + return "i1"; + case dxil::ElementType::I16: + return "i16"; + case dxil::ElementType::U16: + return "u16"; + case dxil::ElementType::I32: + return "i32"; + case dxil::ElementType::U32: + return "u32"; + case dxil::ElementType::I64: + return "i64"; + case dxil::ElementType::U64: + return "u64"; + case dxil::ElementType::F16: + return "f16"; + case dxil::ElementType::F32: + return "f32"; + case dxil::ElementType::F64: + return "f64"; + case dxil::ElementType::SNormF16: + return "snorm_f16"; + case dxil::ElementType::UNormF16: + return "unorm_f16"; + case dxil::ElementType::SNormF32: + return 
"snorm_f32"; + case dxil::ElementType::UNormF32: + return "unorm_f32"; + case dxil::ElementType::SNormF64: + return "snorm_f64"; + case dxil::ElementType::UNormF64: + return "unorm_f64"; + case dxil::ElementType::PackedS8x32: + return "p32i8"; + case dxil::ElementType::PackedU8x32: + return "p32u8"; + case dxil::ElementType::Invalid: + llvm_unreachable("Invalid ElementType"); + } + llvm_unreachable("Unhandled ElementType"); + } else if (RI.isStruct()) + return "struct"; + else if (RI.isCBuffer() || RI.isSampler()) + return "NA"; + return "byte"; +} + +static StringRef getTextureDimName(dxil::ResourceKind RK) { + switch (RK) { + case dxil::ResourceKind::Texture1D: + return "1d"; + case dxil::ResourceKind::Texture2D: + return "2d"; + case dxil::ResourceKind::Texture3D: + return "3d"; + case dxil::ResourceKind::TextureCube: + return "cube"; + case dxil::ResourceKind::Texture1DArray: + return "1darray"; + case dxil::ResourceKind::Texture2DArray: + return "2darray"; + case dxil::ResourceKind::TextureCubeArray: + return "cubearray"; + case dxil::ResourceKind::TBuffer: + return "tbuffer"; + case dxil::ResourceKind::FeedbackTexture2D: + return "fbtex2d"; + case dxil::ResourceKind::FeedbackTexture2DArray: + return "fbtex2darray"; + case dxil::ResourceKind::Texture2DMS: + return "2dMS"; + case dxil::ResourceKind::Texture2DMSArray: + return "2darrayMS"; + case dxil::ResourceKind::Invalid: + case dxil::ResourceKind::NumEntries: + case dxil::ResourceKind::CBuffer: + case dxil::ResourceKind::RawBuffer: + case dxil::ResourceKind::Sampler: + case dxil::ResourceKind::StructuredBuffer: + case dxil::ResourceKind::TypedBuffer: + case dxil::ResourceKind::RTAccelerationStructure: + llvm_unreachable("Invalid ResourceKind for texture"); + } + llvm_unreachable("Unhandled ResourceKind"); +} + +namespace { +struct FormatResourceDimension + : public llvm::FormatAdapter { + explicit FormatResourceDimension(const dxil::ResourceInfo &RI) + : llvm::FormatAdapter(RI) {} + + void format(llvm::raw_ostream &OS, StringRef Style) override { + dxil::ResourceKind RK = Item.getResourceKind(); + switch (RK) { + default: { + OS << getTextureDimName(RK); + if (Item.isMultiSample()) + OS << Item.getMultiSample().Count; + break; + } + case dxil::ResourceKind::RawBuffer: + case dxil::ResourceKind::StructuredBuffer: + if (!Item.isUAV()) + OS << "r/o"; + else if (Item.getUAV().HasCounter) + OS << "r/w+cnt"; + else + OS << "r/w"; + break; + case dxil::ResourceKind::TypedBuffer: + OS << "buf"; + break; + case dxil::ResourceKind::RTAccelerationStructure: + // TODO: dxc would print "ras" here. Can/should this happen? 
+ llvm_unreachable("RTAccelerationStructure printing is not implemented"); + } + } +}; + +struct FormatBindingID + : public llvm::FormatAdapter { + explicit FormatBindingID(const dxil::ResourceInfo &RI) + : llvm::FormatAdapter(RI) {} + + void format(llvm::raw_ostream &OS, StringRef Style) override { + OS << getRCPrefix(Item.getResourceClass()).upper() + << Item.getBinding().RecordID; + } +}; + +struct FormatBindingLocation + : public llvm::FormatAdapter { + explicit FormatBindingLocation(const dxil::ResourceInfo &RI) + : llvm::FormatAdapter(RI) {} + + void format(llvm::raw_ostream &OS, StringRef Style) override { + const auto &Binding = Item.getBinding(); + OS << getRCPrefix(Item.getResourceClass()) << Binding.LowerBound; + if (Binding.Space) + OS << ",space" << Binding.Space; + } +}; + +struct FormatBindingSize + : public llvm::FormatAdapter { + explicit FormatBindingSize(const dxil::ResourceInfo &RI) + : llvm::FormatAdapter(RI) {} + + void format(llvm::raw_ostream &OS, StringRef Style) override { + uint32_t Size = Item.getBinding().Size; + if (Size == std::numeric_limits::max()) + OS << "unbounded"; + else + OS << Size; + } +}; + +} // namespace + +static void prettyPrintResources(raw_ostream &OS, const DXILResourceMap &DRM, const dxil::Resources &MDResources) { - MDResources.print(OS); + // Column widths are arbitrary but match the widths DXC uses. + OS << ";\n; Resource Bindings:\n;\n"; + OS << formatv("; {0,-30} {1,10} {2,7} {3,11} {4,7} {5,14} {6,9}\n", "Name", + "Type", "Format", "Dim", "ID", "HLSL Bind", "Count"); + OS << formatv( + "; {0,-+30} {1,-+10} {2,-+7} {3,-+11} {4,-+7} {5,-+14} {6,-+9}\n", "", "", + "", "", "", "", ""); + + // TODO: Do we want to sort these by binding or something like that? + for (const dxil::ResourceInfo &RI : DRM) { + dxil::ResourceClass RC = RI.getResourceClass(); + assert((RC != dxil::ResourceClass::CBuffer || !MDResources.hasCBuffers()) && + "Old and new cbuffer representations can't coexist"); + assert((RC != dxil::ResourceClass::UAV || !MDResources.hasUAVs()) && + "Old and new UAV representations can't coexist"); + + StringRef Name(RI.getName()); + StringRef Type(getRCName(RC)); + StringRef Format(getFormatName(RI)); + FormatResourceDimension Dim(RI); + FormatBindingID ID(RI); + FormatBindingLocation Bind(RI); + FormatBindingSize Count(RI); + OS << formatv("; {0,-30} {1,10} {2,7} {3,11} {4,7} {5,14} {6,9}\n", Name, + Type, Format, Dim, ID, Bind, Count); + } + + if (MDResources.hasCBuffers()) + MDResources.printCBuffers(OS); + if (MDResources.hasUAVs()) + MDResources.printUAVs(OS); + + OS << ";\n"; } PreservedAnalyses DXILPrettyPrinterPass::run(Module &M, ModuleAnalysisManager &MAM) { + const DXILResourceMap &DRM = MAM.getResult(M); const dxil::Resources &MDResources = MAM.getResult(M); - prettyPrintResources(OS, MDResources); + prettyPrintResources(OS, DRM, MDResources); return PreservedAnalyses::all(); } @@ -49,6 +276,7 @@ class DXILPrettyPrinterLegacy : public llvm::ModulePass { bool runOnModule(Module &M) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); + AU.addRequired(); AU.addRequired(); } }; @@ -57,13 +285,16 @@ class DXILPrettyPrinterLegacy : public llvm::ModulePass { char DXILPrettyPrinterLegacy::ID = 0; INITIALIZE_PASS_BEGIN(DXILPrettyPrinterLegacy, "dxil-pretty-printer", "DXIL Metadata Pretty Printer", true, true) +INITIALIZE_PASS_DEPENDENCY(DXILResourceWrapperPass) INITIALIZE_PASS_DEPENDENCY(DXILResourceMDWrapper) INITIALIZE_PASS_END(DXILPrettyPrinterLegacy, "dxil-pretty-printer", "DXIL Metadata 
Pretty Printer", true, true) bool DXILPrettyPrinterLegacy::runOnModule(Module &M) { + const DXILResourceMap &DRM = + getAnalysis().getResourceMap(); dxil::Resources &Res = getAnalysis().getDXILResource(); - Res.print(OS); + prettyPrintResources(OS, DRM, Res); return false; } diff --git a/llvm/lib/Target/DirectX/DXILResource.cpp b/llvm/lib/Target/DirectX/DXILResource.cpp index 8e5b9867e6661..f027283b70521 100644 --- a/llvm/lib/Target/DirectX/DXILResource.cpp +++ b/llvm/lib/Target/DirectX/DXILResource.cpp @@ -333,37 +333,14 @@ template MDNode *ResourceTable::write(Module &M) const { return MDNode::get(M.getContext(), MDs); } -void Resources::write(Module &M) const { - Metadata *ResourceMDs[4] = {nullptr, nullptr, nullptr, nullptr}; - - ResourceMDs[1] = UAVs.write(M); - - ResourceMDs[2] = CBuffers.write(M); - - bool HasResource = ResourceMDs[0] != nullptr || ResourceMDs[1] != nullptr || - ResourceMDs[2] != nullptr || ResourceMDs[3] != nullptr; - - if (HasResource) { - NamedMDNode *DXResMD = M.getOrInsertNamedMetadata("dx.resources"); - DXResMD->addOperand(MDNode::get(M.getContext(), ResourceMDs)); - } - - NamedMDNode *Entry = M.getNamedMetadata("hlsl.uavs"); - if (Entry) - Entry->eraseFromParent(); +Metadata *Resources::writeUAVs(Module &M) const { return UAVs.write(M); } +void Resources::printUAVs(raw_ostream &OS) const { UAVs.print(OS); } +Metadata *Resources::writeCBuffers(Module &M) const { + return CBuffers.write(M); } +void Resources::printCBuffers(raw_ostream &OS) const { CBuffers.print(OS); } -void Resources::print(raw_ostream &O) const { - O << ";\n" - << "; Resource Bindings:\n" - << ";\n" - << "; Name Type Format Dim " - "ID HLSL Bind Count\n" - << "; ------------------------------ ---------- ------- ----------- " - "------- -------------- ------\n"; - - CBuffers.print(O); - UAVs.print(O); +void Resources::dump() const { + printCBuffers(dbgs()); + printUAVs(dbgs()); } - -void Resources::dump() const { print(dbgs()); } diff --git a/llvm/lib/Target/DirectX/DXILResource.h b/llvm/lib/Target/DirectX/DXILResource.h index 06902fe2b87b0..812729bc4dc57 100644 --- a/llvm/lib/Target/DirectX/DXILResource.h +++ b/llvm/lib/Target/DirectX/DXILResource.h @@ -103,6 +103,7 @@ template class ResourceTable { public: ResourceTable(StringRef Name) : MDName(Name) {} void collect(Module &M); + bool empty() const { return Data.empty(); } MDNode *write(Module &M) const; void print(raw_ostream &O) const; }; @@ -117,8 +118,12 @@ class Resources { public: void collect(Module &M); - void write(Module &M) const; - void print(raw_ostream &O) const; + bool hasUAVs() const { return !UAVs.empty(); } + Metadata *writeUAVs(Module &M) const; + void printUAVs(raw_ostream &OS) const; + bool hasCBuffers() const { return !CBuffers.empty(); } + Metadata *writeCBuffers(Module &M) const; + void printCBuffers(raw_ostream &OS) const; LLVM_DUMP_METHOD void dump() const; }; diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp index 55b3d94568426..2c6d20112060d 100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp @@ -13,16 +13,59 @@ #include "DXILShaderFlags.h" #include "DirectX.h" #include "llvm/ADT/StringSet.h" +#include "llvm/Analysis/DXILResource.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/TargetParser/Triple.h" using namespace llvm; using namespace llvm::dxil; -static void 
translateMetadata(Module &M, const dxil::Resources &MDResources, +static void emitResourceMetadata(Module &M, const DXILResourceMap &DRM, + const dxil::Resources &MDResources) { + LLVMContext &Context = M.getContext(); + + SmallVector SRVs, UAVs, CBufs, Smps; + for (const ResourceInfo &RI : DRM.srvs()) + SRVs.push_back(RI.getAsMetadata(Context)); + for (const ResourceInfo &RI : DRM.uavs()) + UAVs.push_back(RI.getAsMetadata(Context)); + for (const ResourceInfo &RI : DRM.cbuffers()) + CBufs.push_back(RI.getAsMetadata(Context)); + for (const ResourceInfo &RI : DRM.samplers()) + Smps.push_back(RI.getAsMetadata(Context)); + + Metadata *SRVMD = SRVs.empty() ? nullptr : MDNode::get(Context, SRVs); + Metadata *UAVMD = UAVs.empty() ? nullptr : MDNode::get(Context, UAVs); + Metadata *CBufMD = CBufs.empty() ? nullptr : MDNode::get(Context, CBufs); + Metadata *SmpMD = Smps.empty() ? nullptr : MDNode::get(Context, Smps); + bool HasResources = !DRM.empty(); + + if (MDResources.hasUAVs()) { + assert(!UAVMD && "Old and new UAV representations can't coexist"); + UAVMD = MDResources.writeUAVs(M); + HasResources = true; + } + + if (MDResources.hasCBuffers()) { + assert(!CBufMD && "Old and new cbuffer representations can't coexist"); + CBufMD = MDResources.writeCBuffers(M); + HasResources = true; + } + + if (!HasResources) + return; + + NamedMDNode *ResourceMD = M.getOrInsertNamedMetadata("dx.resources"); + ResourceMD->addOperand( + MDNode::get(M.getContext(), {SRVMD, UAVMD, CBufMD, SmpMD})); +} + +static void translateMetadata(Module &M, const DXILResourceMap &DRM, + const dxil::Resources &MDResources, const ComputedShaderFlags &ShaderFlags) { dxil::ValidatorVersionMD ValVerMD(M); if (ValVerMD.isEmpty()) @@ -30,18 +73,19 @@ static void translateMetadata(Module &M, const dxil::Resources &MDResources, dxil::createShaderModelMD(M); dxil::createDXILVersionMD(M); - MDResources.write(M); + emitResourceMetadata(M, DRM, MDResources); dxil::createEntryMD(M, static_cast(ShaderFlags)); } PreservedAnalyses DXILTranslateMetadata::run(Module &M, ModuleAnalysisManager &MAM) { + const DXILResourceMap &DRM = MAM.getResult(M); const dxil::Resources &MDResources = MAM.getResult(M); const ComputedShaderFlags &ShaderFlags = MAM.getResult(M); - translateMetadata(M, MDResources, ShaderFlags); + translateMetadata(M, DRM, MDResources, ShaderFlags); return PreservedAnalyses::all(); } @@ -56,17 +100,20 @@ class DXILTranslateMetadataLegacy : public ModulePass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); + AU.addRequired(); AU.addRequired(); AU.addRequired(); } bool runOnModule(Module &M) override { + const DXILResourceMap &DRM = + getAnalysis().getResourceMap(); const dxil::Resources &MDResources = getAnalysis().getDXILResource(); const ComputedShaderFlags &ShaderFlags = getAnalysis().getShaderFlags(); - translateMetadata(M, MDResources, ShaderFlags); + translateMetadata(M, DRM, MDResources, ShaderFlags); return true; } }; @@ -81,6 +128,7 @@ ModulePass *llvm::createDXILTranslateMetadataLegacyPass() { INITIALIZE_PASS_BEGIN(DXILTranslateMetadataLegacy, "dxil-translate-metadata", "DXIL Translate Metadata", false, false) +INITIALIZE_PASS_DEPENDENCY(DXILResourceWrapperPass) INITIALIZE_PASS_DEPENDENCY(DXILResourceMDWrapper) INITIALIZE_PASS_DEPENDENCY(ShaderFlagsAnalysisWrapper) INITIALIZE_PASS_END(DXILTranslateMetadataLegacy, "dxil-translate-metadata", diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 3f6de365fe393..7c77bf2b31b8e 100644 --- 
a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -857,7 +857,9 @@ static void getLiveOutRegsAt(LivePhysRegs &Regs, const MachineInstr &MI) { void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, - MCRegister SrcReg, bool KillSrc) const { + MCRegister SrcReg, bool KillSrc, + bool RenamableDest, + bool RenamableSrc) const { const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo(); unsigned KillFlag = getKillRegState(KillSrc); diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h index 4efc62fd717c6..854c3694ceba7 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h @@ -174,7 +174,8 @@ class HexagonInstrInfo : public HexagonGenInstrInfo { /// large registers. See for example the ARM target. void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; /// Store the specified register of the given register class to the specified /// stack frame index. The store instruction is to be added to the given diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp index 9f3d9b7aaa6f9..06ef5d114f455 100644 --- a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp +++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp @@ -35,8 +35,8 @@ void LanaiInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator Position, const DebugLoc &DL, MCRegister DestinationRegister, - MCRegister SourceRegister, - bool KillSource) const { + MCRegister SourceRegister, bool KillSource, + bool RenamableDest, bool RenamableSrc) const { if (!Lanai::GPRRegClass.contains(DestinationRegister, SourceRegister)) { llvm_unreachable("Impossible reg-to-reg copy"); } diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.h b/llvm/lib/Target/Lanai/LanaiInstrInfo.h index 8ad2b9237c928..2630464f0a76f 100644 --- a/llvm/lib/Target/Lanai/LanaiInstrInfo.h +++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.h @@ -49,7 +49,9 @@ class LanaiInstrInfo : public LanaiGenInstrInfo { void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator Position, const DebugLoc &DL, MCRegister DestinationRegister, - MCRegister SourceRegister, bool KillSource) const override; + MCRegister SourceRegister, bool KillSource, + bool RenamableDest = false, + bool RenamableSrc = false) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator Position, diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp index f52e188f87792..c2ae4a0734b6a 100644 --- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp +++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp @@ -1314,8 +1314,8 @@ void LoongArchAsmParser::emitFuncCall36(MCInst &Inst, SMLoc IDLoc, // expands to: // pcaddu18i $rj, %call36(sym) // jirl $r0, $rj, 0 - unsigned ScratchReg = - IsTailCall ? Inst.getOperand(0).getReg() : (unsigned)LoongArch::R1; + MCRegister ScratchReg = + IsTailCall ? Inst.getOperand(0).getReg() : MCRegister(LoongArch::R1); const MCExpr *Sym = IsTailCall ? 
Inst.getOperand(1).getExpr() : Inst.getOperand(0).getExpr(); const LoongArchMCExpr *LE = LoongArchMCExpr::create( @@ -1326,7 +1326,7 @@ void LoongArchAsmParser::emitFuncCall36(MCInst &Inst, SMLoc IDLoc, getSTI()); Out.emitInstruction( MCInstBuilder(LoongArch::JIRL) - .addReg(IsTailCall ? (unsigned)LoongArch::R0 : ScratchReg) + .addReg(IsTailCall ? MCRegister(LoongArch::R0) : ScratchReg) .addReg(ScratchReg) .addImm(0), getSTI()); diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 50c6c263e966b..95c1b150722f6 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -5012,7 +5012,7 @@ static bool CC_LoongArch_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, LoongArch::R23, LoongArch::R24, LoongArch::R25, LoongArch::R26, LoongArch::R27, LoongArch::R28, LoongArch::R29, LoongArch::R30, LoongArch::R31}; - if (unsigned Reg = State.AllocateReg(GPRList)) { + if (MCRegister Reg = State.AllocateReg(GPRList)) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; } @@ -5023,7 +5023,7 @@ static bool CC_LoongArch_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, // fs0,fs1,fs2,fs3 static const MCPhysReg FPR32List[] = {LoongArch::F24, LoongArch::F25, LoongArch::F26, LoongArch::F27}; - if (unsigned Reg = State.AllocateReg(FPR32List)) { + if (MCRegister Reg = State.AllocateReg(FPR32List)) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; } @@ -5034,7 +5034,7 @@ static bool CC_LoongArch_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, // fs4,fs5,fs6,fs7 static const MCPhysReg FPR64List[] = {LoongArch::F28_64, LoongArch::F29_64, LoongArch::F30_64, LoongArch::F31_64}; - if (unsigned Reg = State.AllocateReg(FPR64List)) { + if (MCRegister Reg = State.AllocateReg(FPR64List)) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; } diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp index 90d94e96b0efd..9059da460f135 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp @@ -39,7 +39,9 @@ MCInst LoongArchInstrInfo::getNop() const { void LoongArchInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, MCRegister DstReg, - MCRegister SrcReg, bool KillSrc) const { + MCRegister SrcReg, bool KillSrc, + bool RenamableDest, + bool RenamableSrc) const { if (LoongArch::GPRRegClass.contains(DstReg, SrcReg)) { BuildMI(MBB, MBBI, DL, get(LoongArch::OR), DstReg) .addReg(SrcReg, getKillRegState(KillSrc)) diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h index d66b2cb8efb33..ef9970783107e 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h @@ -31,7 +31,8 @@ class LoongArchInstrInfo : public LoongArchGenInstrInfo { void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, MCRegister DstReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.cpp b/llvm/lib/Target/M68k/M68kInstrInfo.cpp index 338db45782c96..23c5c76a47479 100644 --- 
a/llvm/lib/Target/M68k/M68kInstrInfo.cpp +++ b/llvm/lib/Target/M68k/M68kInstrInfo.cpp @@ -663,7 +663,8 @@ bool M68kInstrInfo::isPCRelRegisterOperandLegal( void M68kInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DstReg, - MCRegister SrcReg, bool KillSrc) const { + MCRegister SrcReg, bool KillSrc, + bool RenamableDest, bool RenamableSrc) const { unsigned Opc = 0; // First deal with the normal symmetric copies. diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.h b/llvm/lib/Target/M68k/M68kInstrInfo.h index d1e1e1cd99987..5d81956d89fdf 100644 --- a/llvm/lib/Target/M68k/M68kInstrInfo.h +++ b/llvm/lib/Target/M68k/M68kInstrInfo.h @@ -271,7 +271,8 @@ class M68kInstrInfo : public M68kGenInstrInfo { void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; bool getStackSlotRange(const TargetRegisterClass *RC, unsigned SubIdx, unsigned &Size, unsigned &Offset, diff --git a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp index ba7b6c85bd81a..1c7a14464d7bb 100644 --- a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -527,7 +527,7 @@ static void AnalyzeArguments(CCState &State, if (!UsedStack && Parts == 2 && RegsLeft == 1) { // Special case for 32-bit register split, see EABI section 3.3.3 - unsigned Reg = State.AllocateReg(RegList); + MCRegister Reg = State.AllocateReg(RegList); State.addLoc(CCValAssign::getReg(ValNo++, ArgVT, Reg, LocVT, LocInfo)); RegsLeft -= 1; @@ -535,7 +535,7 @@ static void AnalyzeArguments(CCState &State, CC_MSP430_AssignStack(ValNo++, ArgVT, LocVT, LocInfo, ArgFlags, State); } else if (Parts <= RegsLeft) { for (unsigned j = 0; j < Parts; j++) { - unsigned Reg = State.AllocateReg(RegList); + MCRegister Reg = State.AllocateReg(RegList); State.addLoc(CCValAssign::getReg(ValNo++, ArgVT, Reg, LocVT, LocInfo)); RegsLeft--; } diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp index 7405716516643..ae1228ceaa4e3 100644 --- a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -90,7 +90,8 @@ void MSP430InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, void MSP430InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, - MCRegister SrcReg, bool KillSrc) const { + MCRegister SrcReg, bool KillSrc, + bool RenamableDest, bool RenamableSrc) const { unsigned Opc; if (MSP430::GR16RegClass.contains(DestReg, SrcReg)) Opc = MSP430::MOV16rr; diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.h b/llvm/lib/Target/MSP430/MSP430InstrInfo.h index b8d015a21cd15..113a22318bec5 100644 --- a/llvm/lib/Target/MSP430/MSP430InstrInfo.h +++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.h @@ -37,7 +37,8 @@ class MSP430InstrInfo : public MSP430GenInstrInfo { void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp 
index 076e0a20cb97e..c50c2063ee8ed 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -5782,9 +5782,9 @@ bool MipsAsmParser::expandMXTRAlias(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, sel = 3; break; } - unsigned Op0 = IsMFTR ? Inst.getOperand(0).getReg() : rd; - unsigned Op1 = - IsMFTR ? rd + MCRegister Op0 = IsMFTR ? Inst.getOperand(0).getReg() : MCRegister(rd); + MCRegister Op1 = + IsMFTR ? MCRegister(rd) : (Inst.getOpcode() != Mips::MTTDSP ? Inst.getOperand(1).getReg() : Inst.getOperand(0).getReg()); diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.cpp b/llvm/lib/Target/Mips/Mips16InstrInfo.cpp index 30ac96936de28..1bc1ed7ab93e3 100644 --- a/llvm/lib/Target/Mips/Mips16InstrInfo.cpp +++ b/llvm/lib/Target/Mips/Mips16InstrInfo.cpp @@ -69,7 +69,8 @@ Register Mips16InstrInfo::isStoreToStackSlot(const MachineInstr &MI, void Mips16InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, - MCRegister SrcReg, bool KillSrc) const { + MCRegister SrcReg, bool KillSrc, + bool RenamableDest, bool RenamableSrc) const { unsigned Opc = 0; if (Mips::CPU16RegsRegClass.contains(DestReg) && diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.h b/llvm/lib/Target/Mips/Mips16InstrInfo.h index e8567ee3b9ce5..8e73c8079b0f8 100644 --- a/llvm/lib/Target/Mips/Mips16InstrInfo.h +++ b/llvm/lib/Target/Mips/Mips16InstrInfo.h @@ -50,7 +50,8 @@ class Mips16InstrInfo : public MipsInstrInfo { void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; void storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, diff --git a/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp b/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp index b2ba0f8fe74dc..311b73710fb7a 100644 --- a/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp +++ b/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp @@ -1631,28 +1631,26 @@ MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) { void MipsConstantIslands::prescanForConstants() { for (MachineBasicBlock &B : *MF) { - for (MachineBasicBlock::instr_iterator I = B.instr_begin(), - EB = B.instr_end(); - I != EB; ++I) { - switch(I->getDesc().getOpcode()) { + for (MachineInstr &MI : B) { + switch (MI.getDesc().getOpcode()) { case Mips::LwConstant32: { PrescannedForConstants = true; - LLVM_DEBUG(dbgs() << "constant island constant " << *I << "\n"); - LLVM_DEBUG(dbgs() << "num operands " << I->getNumOperands() << "\n"); - MachineOperand &Literal = I->getOperand(1); + LLVM_DEBUG(dbgs() << "constant island constant " << MI << "\n"); + LLVM_DEBUG(dbgs() << "num operands " << MI.getNumOperands() << "\n"); + MachineOperand &Literal = MI.getOperand(1); if (Literal.isImm()) { int64_t V = Literal.getImm(); LLVM_DEBUG(dbgs() << "literal " << V << "\n"); Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); const Constant *C = ConstantInt::get(Int32Ty, V); unsigned index = MCP->getConstantPoolIndex(C, Align(4)); - I->getOperand(2).ChangeToImmediate(index); - LLVM_DEBUG(dbgs() << "constant island constant " << *I << "\n"); - I->setDesc(TII->get(Mips::LwRxPcTcp16)); - I->removeOperand(1); - I->removeOperand(1); - I->addOperand(MachineOperand::CreateCPI(index, 0)); - I->addOperand(MachineOperand::CreateImm(4)); + MI.getOperand(2).ChangeToImmediate(index); + LLVM_DEBUG(dbgs() << 
"constant island constant " << MI << "\n"); + MI.setDesc(TII->get(Mips::LwRxPcTcp16)); + MI.removeOperand(1); + MI.removeOperand(1); + MI.addOperand(MachineOperand::CreateCPI(index, 0)); + MI.addOperand(MachineOperand::CreateImm(4)); } break; } diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index 0f2047fcac640..fa57a3fa9b155 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -370,6 +370,9 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal); setOperationAction(ISD::FMINNUM, MVT::f64, Expand); setOperationAction(ISD::FMAXNUM, MVT::f64, Expand); + } else { + setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom); } if (Subtarget.isGP64bit()) { @@ -1251,6 +1254,8 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::VAARG: return lowerVAARG(Op, DAG); case ISD::FCOPYSIGN: return lowerFCOPYSIGN(Op, DAG); case ISD::FABS: return lowerFABS(Op, DAG); + case ISD::FCANONICALIZE: + return lowerFCANONICALIZE(Op, DAG); case ISD::FRAMEADDR: return lowerFRAMEADDR(Op, DAG); case ISD::RETURNADDR: return lowerRETURNADDR(Op, DAG); case ISD::EH_RETURN: return lowerEH_RETURN(Op, DAG); @@ -2520,6 +2525,20 @@ SDValue MipsTargetLowering::lowerFABS(SDValue Op, SelectionDAG &DAG) const { return lowerFABS32(Op, DAG, Subtarget.hasExtractInsert()); } +SDValue MipsTargetLowering::lowerFCANONICALIZE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue Operand = Op.getOperand(0); + SDNodeFlags Flags = Op->getFlags(); + + if (Flags.hasNoNaNs() || DAG.isKnownNeverNaN(Operand)) + return Operand; + + SDValue Quiet = DAG.getNode(ISD::FADD, DL, VT, Operand, Operand); + return DAG.getSelectCC(DL, Operand, Operand, Quiet, Operand, ISD::SETUO); +} + SDValue MipsTargetLowering:: lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { // check the depth @@ -2991,7 +3010,7 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT, } else { Reg = State.AllocateReg(F64Regs); // Shadow int registers - unsigned Reg2 = State.AllocateReg(IntRegs); + MCRegister Reg2 = State.AllocateReg(IntRegs); if (Reg2 == Mips::A1 || Reg2 == Mips::A3) State.AllocateReg(IntRegs); State.AllocateReg(IntRegs); diff --git a/llvm/lib/Target/Mips/MipsISelLowering.h b/llvm/lib/Target/Mips/MipsISelLowering.h index 84ad40d6bbbe2..2b18b29918092 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.h +++ b/llvm/lib/Target/Mips/MipsISelLowering.h @@ -550,6 +550,7 @@ class TargetRegisterClass; bool HasExtractInsert) const; SDValue lowerFABS64(SDValue Op, SelectionDAG &DAG, bool HasExtractInsert) const; + SDValue lowerFCANONICALIZE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp b/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp index b99ddfab2a47d..87e9ef1c26420 100644 --- a/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -83,7 +83,8 @@ Register MipsSEInstrInfo::isStoreToStackSlot(const MachineInstr &MI, void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, - MCRegister SrcReg, bool KillSrc) const { + MCRegister SrcReg, bool KillSrc, + bool 
RenamableDest, bool RenamableSrc) const { unsigned Opc = 0, ZeroReg = 0; bool isMicroMips = Subtarget.inMicroMipsMode(); diff --git a/llvm/lib/Target/Mips/MipsSEInstrInfo.h b/llvm/lib/Target/Mips/MipsSEInstrInfo.h index a8855e26ad10f..36bddba10410c 100644 --- a/llvm/lib/Target/Mips/MipsSEInstrInfo.h +++ b/llvm/lib/Target/Mips/MipsSEInstrInfo.h @@ -44,7 +44,8 @@ class MipsSEInstrInfo : public MipsInstrInfo { void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; void storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 673858f92e7ce..bec40874c8948 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -32,7 +32,8 @@ NVPTXInstrInfo::NVPTXInstrInfo() : RegInfo() {} void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, - MCRegister SrcReg, bool KillSrc) const { + MCRegister SrcReg, bool KillSrc, + bool RenamableDest, bool RenamableSrc) const { const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg); const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h index d6cbeae6984c9..f674a00bc351b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h @@ -53,7 +53,8 @@ class NVPTXInstrInfo : public NVPTXGenInstrInfo { void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; // Branch analysis. bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, diff --git a/llvm/lib/Target/PowerPC/CMakeLists.txt b/llvm/lib/Target/PowerPC/CMakeLists.txt index d866ef6b88a1d..cd4c76013d204 100644 --- a/llvm/lib/Target/PowerPC/CMakeLists.txt +++ b/llvm/lib/Target/PowerPC/CMakeLists.txt @@ -54,7 +54,6 @@ add_llvm_target(PowerPCCodeGen PPCReduceCRLogicals.cpp PPCVSXFMAMutate.cpp PPCVSXSwapRemoval.cpp - PPCExpandISEL.cpp PPCPreEmitPeephole.cpp PPCLowerMASSVEntries.cpp PPCGenScalarMASSEntries.cpp diff --git a/llvm/lib/Target/PowerPC/PPCCallingConv.cpp b/llvm/lib/Target/PowerPC/PPCCallingConv.cpp index 188fc96bc7c2a..d5077ab279651 100644 --- a/llvm/lib/Target/PowerPC/PPCCallingConv.cpp +++ b/llvm/lib/Target/PowerPC/PPCCallingConv.cpp @@ -151,7 +151,7 @@ static bool CC_PPC32_SPE_CustomSplitFP64(unsigned &ValNo, MVT &ValVT, static const MCPhysReg LoRegList[] = { PPC::R4, PPC::R6, PPC::R8, PPC::R10 }; // Try to get the first register. 
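Throughout the calling-convention helpers touched in this patch, the result of CCState::AllocateReg is now held as MCRegister instead of unsigned. The call sites otherwise stay the same because MCRegister is a thin wrapper around the physical register number: it converts back to unsigned, so the usual "did we get a register?" truthiness test and comparisons against MCPhysReg values keep working. Below is a minimal sketch of the idiom, assuming a made-up GPRList and an arbitrary 8-byte stack slot; it is illustrative only, not code from the patch.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/MC/MCRegister.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

// Assign one integer argument either to the next free register in GPRList
// or to a stack slot. GPRList is a placeholder; the CCState/CCValAssign
// calls are the real API used by the CC_* helpers in this patch.
static bool assignIntArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                         CCValAssign::LocInfo LocInfo, CCState &State,
                         ArrayRef<MCPhysReg> GPRList) {
  // AllocateReg returns an invalid MCRegister (register number 0) when the
  // list is exhausted, so the familiar `if (Reg)` test reads unchanged.
  if (MCRegister Reg = State.AllocateReg(GPRList)) {
    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
    return false; // argument fully handled in a register
  }
  // Otherwise fall back to the stack (8-byte slot purely for illustration).
  unsigned Offset = State.AllocateStack(8, Align(8));
  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
  return false;
}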
- unsigned Reg = State.AllocateReg(HiRegList); + MCRegister Reg = State.AllocateReg(HiRegList); if (!Reg) return false; @@ -160,7 +160,7 @@ static bool CC_PPC32_SPE_CustomSplitFP64(unsigned &ValNo, MVT &ValVT, if (HiRegList[i] == Reg) break; - unsigned T = State.AllocateReg(LoRegList[i]); + MCRegister T = State.AllocateReg(LoRegList[i]); (void)T; assert(T == LoRegList[i] && "Could not allocate register"); @@ -180,7 +180,7 @@ static bool CC_PPC32_SPE_RetF64(unsigned &ValNo, MVT &ValVT, static const MCPhysReg LoRegList[] = { PPC::R4 }; // Try to get the first register. - unsigned Reg = State.AllocateReg(HiRegList, LoRegList); + MCRegister Reg = State.AllocateReg(HiRegList, LoRegList); if (!Reg) return false; diff --git a/llvm/lib/Target/PowerPC/PPCExpandISEL.cpp b/llvm/lib/Target/PowerPC/PPCExpandISEL.cpp deleted file mode 100644 index 4c74e82cf0412..0000000000000 --- a/llvm/lib/Target/PowerPC/PPCExpandISEL.cpp +++ /dev/null @@ -1,491 +0,0 @@ -//===------------- PPCExpandISEL.cpp - Expand ISEL instruction ------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// A pass that expands the ISEL instruction into an if-then-else sequence. -// This pass must be run post-RA since all operands must be physical registers. -// -//===----------------------------------------------------------------------===// - -#include "PPC.h" -#include "PPCInstrInfo.h" -#include "PPCSubtarget.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/LivePhysRegs.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "ppc-expand-isel" - -STATISTIC(NumExpanded, "Number of ISEL instructions expanded"); -STATISTIC(NumRemoved, "Number of ISEL instructions removed"); -STATISTIC(NumFolded, "Number of ISEL instructions folded"); - -// If -ppc-gen-isel=false is set, we will disable generating the ISEL -// instruction on all PPC targets. Otherwise, if the user set option -// -misel or the platform supports ISEL by default, still generate the -// ISEL instruction, else expand it. -static cl::opt - GenerateISEL("ppc-gen-isel", - cl::desc("Enable generating the ISEL instruction."), - cl::init(true), cl::Hidden); - -namespace { -class PPCExpandISEL : public MachineFunctionPass { - DebugLoc dl; - MachineFunction *MF; - const TargetInstrInfo *TII; - bool IsTrueBlockRequired; - bool IsFalseBlockRequired; - MachineBasicBlock *TrueBlock; - MachineBasicBlock *FalseBlock; - MachineBasicBlock *NewSuccessor; - MachineBasicBlock::iterator TrueBlockI; - MachineBasicBlock::iterator FalseBlockI; - - typedef SmallVector BlockISELList; - typedef SmallDenseMap ISELInstructionList; - - // A map of MBB numbers to their lists of contained ISEL instructions. - // Please note when we traverse this list and expand ISEL, we only remove - // the ISEL from the MBB not from this list. - ISELInstructionList ISELInstructions; - - /// Initialize the object. 
- void initialize(MachineFunction &MFParam); - - void handleSpecialCases(BlockISELList &BIL, MachineBasicBlock *MBB); - void reorganizeBlockLayout(BlockISELList &BIL, MachineBasicBlock *MBB); - void populateBlocks(BlockISELList &BIL); - void expandMergeableISELs(BlockISELList &BIL); - void expandAndMergeISELs(); - - bool canMerge(MachineInstr *PrevPushedMI, MachineInstr *MI); - - /// Is this instruction an ISEL or ISEL8? - static bool isISEL(const MachineInstr &MI) { - return (MI.getOpcode() == PPC::ISEL || MI.getOpcode() == PPC::ISEL8); - } - - /// Is this instruction an ISEL8? - static bool isISEL8(const MachineInstr &MI) { - return (MI.getOpcode() == PPC::ISEL8); - } - - /// Are the two operands using the same register? - bool useSameRegister(const MachineOperand &Op1, const MachineOperand &Op2) { - return (Op1.getReg() == Op2.getReg()); - } - - /// - /// Collect all ISEL instructions from the current function. - /// - /// Walk the current function and collect all the ISEL instructions that are - /// found. The instructions are placed in the ISELInstructions vector. - /// - /// \return true if any ISEL instructions were found, false otherwise - /// - bool collectISELInstructions(); - -public: - static char ID; - PPCExpandISEL() : MachineFunctionPass(ID) { - initializePPCExpandISELPass(*PassRegistry::getPassRegistry()); - } - - /// - /// Determine whether to generate the ISEL instruction or expand it. - /// - /// Expand ISEL instruction into if-then-else sequence when one of - /// the following two conditions hold: - /// (1) -ppc-gen-isel=false - /// (2) hasISEL() return false - /// Otherwise, still generate ISEL instruction. - /// The -ppc-gen-isel option is set to true by default. Which means the ISEL - /// instruction is still generated by default on targets that support them. - /// - /// \return true if ISEL should be expanded into if-then-else code sequence; - /// false if ISEL instruction should be generated, i.e. not expanded. - /// - static bool isExpandISELEnabled(const MachineFunction &MF); - -#ifndef NDEBUG - void DumpISELInstructions() const; -#endif - - bool runOnMachineFunction(MachineFunction &MF) override { - LLVM_DEBUG(dbgs() << "Function: "; MF.dump(); dbgs() << "\n"); - initialize(MF); - - if (!collectISELInstructions()) { - LLVM_DEBUG(dbgs() << "No ISEL instructions in this function\n"); - return false; - } - -#ifndef NDEBUG - DumpISELInstructions(); -#endif - - expandAndMergeISELs(); - - return true; - } -}; -} // end anonymous namespace - -void PPCExpandISEL::initialize(MachineFunction &MFParam) { - MF = &MFParam; - TII = MF->getSubtarget().getInstrInfo(); - ISELInstructions.clear(); -} - -bool PPCExpandISEL::isExpandISELEnabled(const MachineFunction &MF) { - return !GenerateISEL || !MF.getSubtarget().hasISEL(); -} - -bool PPCExpandISEL::collectISELInstructions() { - for (MachineBasicBlock &MBB : *MF) { - BlockISELList thisBlockISELs; - for (MachineInstr &MI : MBB) - if (isISEL(MI)) - thisBlockISELs.push_back(&MI); - if (!thisBlockISELs.empty()) - ISELInstructions.insert(std::make_pair(MBB.getNumber(), thisBlockISELs)); - } - return !ISELInstructions.empty(); -} - -#ifndef NDEBUG -void PPCExpandISEL::DumpISELInstructions() const { - for (const auto &I : ISELInstructions) { - LLVM_DEBUG(dbgs() << printMBBReference(*MF->getBlockNumbered(I.first)) - << ":\n"); - for (const auto &VI : I.second) - LLVM_DEBUG(dbgs() << " "; VI->print(dbgs())); - } -} -#endif - -/// Contiguous ISELs that have the same condition can be merged. 
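For readers skimming this deletion: PPCExpandISEL rewrote each post-RA `isel` (a conditional register select) into a small if-then-else diamond, after first handling the degenerate cases described further down in the removed file (an isel whose three registers are identical is simply deleted, and one whose true and false inputs match is folded to an OR copy). The snippet below is ordinary C++ written only to illustrate the select semantics and where the expansion's ADDI/ORI copies landed; it is not code from the pass.

// Illustration only: the value an isel computes and where the deleted
// pass's expansion placed each copy. "CRBit" stands for the condition
// register bit operand (operand 3 of the ISEL MachineInstr).
static unsigned expandedIselSemantics(bool CRBit, unsigned TrueVal,
                                      unsigned FalseVal) {
  unsigned Dest;
  if (CRBit) {
    // TrueBlock: emitted as  addi Dest, TrueVal, 0
    Dest = TrueVal;
  } else {
    // FalseBlock (or the original block): emitted as  ori Dest, FalseVal, 0
    Dest = FalseVal;
  }
  // Control continues in the successor block in both cases.
  return Dest;
}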
-bool PPCExpandISEL::canMerge(MachineInstr *PrevPushedMI, MachineInstr *MI) { - // Same Condition Register? - if (!useSameRegister(PrevPushedMI->getOperand(3), MI->getOperand(3))) - return false; - - MachineBasicBlock::iterator PrevPushedMBBI = *PrevPushedMI; - MachineBasicBlock::iterator MBBI = *MI; - return (std::prev(MBBI) == PrevPushedMBBI); // Contiguous ISELs? -} - -void PPCExpandISEL::expandAndMergeISELs() { - bool ExpandISELEnabled = isExpandISELEnabled(*MF); - - for (auto &BlockList : ISELInstructions) { - LLVM_DEBUG( - dbgs() << "Expanding ISEL instructions in " - << printMBBReference(*MF->getBlockNumbered(BlockList.first)) - << "\n"); - BlockISELList &CurrentISELList = BlockList.second; - auto I = CurrentISELList.begin(); - auto E = CurrentISELList.end(); - - while (I != E) { - assert(isISEL(**I) && "Expecting an ISEL instruction"); - MachineOperand &Dest = (*I)->getOperand(0); - MachineOperand &TrueValue = (*I)->getOperand(1); - MachineOperand &FalseValue = (*I)->getOperand(2); - - // Special case 1, all registers used by ISEL are the same one. - // The non-redundant isel 0, 0, 0, N would not satisfy these conditions - // as it would be ISEL %R0, %ZERO, %R0, %CRN. - if (useSameRegister(Dest, TrueValue) && - useSameRegister(Dest, FalseValue)) { - LLVM_DEBUG(dbgs() << "Remove redundant ISEL instruction: " << **I - << "\n"); - // FIXME: if the CR field used has no other uses, we could eliminate the - // instruction that defines it. This would have to be done manually - // since this pass runs too late to run DCE after it. - NumRemoved++; - (*I)->eraseFromParent(); - I++; - } else if (useSameRegister(TrueValue, FalseValue)) { - // Special case 2, the two input registers used by ISEL are the same. - // Note: the non-foldable isel RX, 0, 0, N would not satisfy this - // condition as it would be ISEL %RX, %ZERO, %R0, %CRN, which makes it - // safe to fold ISEL to MR(OR) instead of ADDI. - MachineBasicBlock *MBB = (*I)->getParent(); - LLVM_DEBUG( - dbgs() << "Fold the ISEL instruction to an unconditional copy:\n"); - LLVM_DEBUG(dbgs() << "ISEL: " << **I << "\n"); - NumFolded++; - // Note: we're using both the TrueValue and FalseValue operands so as - // not to lose the kill flag if it is set on either of them. - BuildMI(*MBB, (*I), dl, TII->get(isISEL8(**I) ? PPC::OR8 : PPC::OR)) - .add(Dest) - .add(TrueValue) - .add(FalseValue); - (*I)->eraseFromParent(); - I++; - } else if (ExpandISELEnabled) { // Normal cases expansion enabled - LLVM_DEBUG(dbgs() << "Expand ISEL instructions:\n"); - LLVM_DEBUG(dbgs() << "ISEL: " << **I << "\n"); - BlockISELList SubISELList; - SubISELList.push_back(*I++); - // Collect the ISELs that can be merged together. - // This will eat up ISEL instructions without considering whether they - // may be redundant or foldable to a register copy. So we still keep - // the handleSpecialCases() downstream to handle them. 
- while (I != E && canMerge(SubISELList.back(), *I)) { - LLVM_DEBUG(dbgs() << "ISEL: " << **I << "\n"); - SubISELList.push_back(*I++); - } - - expandMergeableISELs(SubISELList); - } else { // Normal cases expansion disabled - I++; // leave the ISEL as it is - } - } // end while - } // end for -} - -void PPCExpandISEL::handleSpecialCases(BlockISELList &BIL, - MachineBasicBlock *MBB) { - IsTrueBlockRequired = false; - IsFalseBlockRequired = false; - - auto MI = BIL.begin(); - while (MI != BIL.end()) { - assert(isISEL(**MI) && "Expecting an ISEL instruction"); - LLVM_DEBUG(dbgs() << "ISEL: " << **MI << "\n"); - - MachineOperand &Dest = (*MI)->getOperand(0); - MachineOperand &TrueValue = (*MI)->getOperand(1); - MachineOperand &FalseValue = (*MI)->getOperand(2); - - // If at least one of the ISEL instructions satisfy the following - // condition, we need the True Block: - // The Dest Register and True Value Register are not the same - // Similarly, if at least one of the ISEL instructions satisfy the - // following condition, we need the False Block: - // The Dest Register and False Value Register are not the same. - bool IsADDIInstRequired = !useSameRegister(Dest, TrueValue); - bool IsORIInstRequired = !useSameRegister(Dest, FalseValue); - - // Special case 1, all registers used by ISEL are the same one. - if (!IsADDIInstRequired && !IsORIInstRequired) { - LLVM_DEBUG(dbgs() << "Remove redundant ISEL instruction."); - // FIXME: if the CR field used has no other uses, we could eliminate the - // instruction that defines it. This would have to be done manually - // since this pass runs too late to run DCE after it. - NumRemoved++; - (*MI)->eraseFromParent(); - // Setting MI to the erase result keeps the iterator valid and increased. - MI = BIL.erase(MI); - continue; - } - - // Special case 2, the two input registers used by ISEL are the same. - // Note 1: We favor merging ISEL expansions over folding a single one. If - // the passed list has multiple merge-able ISEL's, we won't fold any. - // Note 2: There is no need to test for PPC::R0/PPC::X0 because PPC::ZERO/ - // PPC::ZERO8 will be used for the first operand if the value is meant to - // be zero. In this case, the useSameRegister method will return false, - // thereby preventing this ISEL from being folded. - if (useSameRegister(TrueValue, FalseValue) && (BIL.size() == 1)) { - LLVM_DEBUG( - dbgs() << "Fold the ISEL instruction to an unconditional copy."); - NumFolded++; - // Note: we're using both the TrueValue and FalseValue operands so as - // not to lose the kill flag if it is set on either of them. - BuildMI(*MBB, (*MI), dl, TII->get(isISEL8(**MI) ? PPC::OR8 : PPC::OR)) - .add(Dest) - .add(TrueValue) - .add(FalseValue); - (*MI)->eraseFromParent(); - // Setting MI to the erase result keeps the iterator valid and increased. - MI = BIL.erase(MI); - continue; - } - - IsTrueBlockRequired |= IsADDIInstRequired; - IsFalseBlockRequired |= IsORIInstRequired; - MI++; - } -} - -void PPCExpandISEL::reorganizeBlockLayout(BlockISELList &BIL, - MachineBasicBlock *MBB) { - if (BIL.empty()) - return; - - assert((IsTrueBlockRequired || IsFalseBlockRequired) && - "Should have been handled by special cases earlier!"); - - MachineBasicBlock *Successor = nullptr; - const BasicBlock *LLVM_BB = MBB->getBasicBlock(); - MachineBasicBlock::iterator MBBI = (*BIL.back()); - NewSuccessor = (MBBI != MBB->getLastNonDebugInstr() || !MBB->canFallThrough()) - // Another BB is needed to move the instructions that - // follow this ISEL. 
If the ISEL is the last instruction - // in a block that can't fall through, we also need a block - // to branch to. - ? MF->CreateMachineBasicBlock(LLVM_BB) - : nullptr; - - MachineFunction::iterator It = MBB->getIterator(); - ++It; // Point to the successor block of MBB. - - // If NewSuccessor is NULL then the last ISEL in this group is the last - // non-debug instruction in this block. Find the fall-through successor - // of this block to use when updating the CFG below. - if (!NewSuccessor) { - for (auto &Succ : MBB->successors()) { - if (MBB->isLayoutSuccessor(Succ)) { - Successor = Succ; - break; - } - } - } else - Successor = NewSuccessor; - - // The FalseBlock and TrueBlock are inserted after the MBB block but before - // its successor. - // Note this need to be done *after* the above setting the Successor code. - if (IsFalseBlockRequired) { - FalseBlock = MF->CreateMachineBasicBlock(LLVM_BB); - MF->insert(It, FalseBlock); - } - - if (IsTrueBlockRequired) { - TrueBlock = MF->CreateMachineBasicBlock(LLVM_BB); - MF->insert(It, TrueBlock); - } - - if (NewSuccessor) { - MF->insert(It, NewSuccessor); - - // Transfer the rest of this block into the new successor block. - NewSuccessor->splice(NewSuccessor->end(), MBB, - std::next(MachineBasicBlock::iterator(BIL.back())), - MBB->end()); - NewSuccessor->transferSuccessorsAndUpdatePHIs(MBB); - - // Update the liveins for NewSuccessor. - LivePhysRegs LPR; - computeAndAddLiveIns(LPR, *NewSuccessor); - - } else { - // Remove successor from MBB. - MBB->removeSuccessor(Successor); - } - - // Note that this needs to be done *after* transfering the successors from MBB - // to the NewSuccessor block, otherwise these blocks will also be transferred - // as successors! - MBB->addSuccessor(IsTrueBlockRequired ? TrueBlock : Successor); - MBB->addSuccessor(IsFalseBlockRequired ? FalseBlock : Successor); - - if (IsTrueBlockRequired) { - TrueBlockI = TrueBlock->begin(); - TrueBlock->addSuccessor(Successor); - } - - if (IsFalseBlockRequired) { - FalseBlockI = FalseBlock->begin(); - FalseBlock->addSuccessor(Successor); - } - - // Conditional branch to the TrueBlock or Successor - BuildMI(*MBB, BIL.back(), dl, TII->get(PPC::BC)) - .add(BIL.back()->getOperand(3)) - .addMBB(IsTrueBlockRequired ? TrueBlock : Successor); - - // Jump over the true block to the new successor if the condition is false. - BuildMI(*(IsFalseBlockRequired ? FalseBlock : MBB), - (IsFalseBlockRequired ? FalseBlockI : BIL.back()), dl, - TII->get(PPC::B)) - .addMBB(Successor); - - if (IsFalseBlockRequired) - FalseBlockI = FalseBlock->begin(); // get the position of PPC::B -} - -void PPCExpandISEL::populateBlocks(BlockISELList &BIL) { - for (auto &MI : BIL) { - assert(isISEL(*MI) && "Expecting an ISEL instruction"); - - MachineOperand &Dest = MI->getOperand(0); // location to store to - MachineOperand &TrueValue = MI->getOperand(1); // Value to store if - // condition is true - MachineOperand &FalseValue = MI->getOperand(2); // Value to store if - // condition is false - - LLVM_DEBUG(dbgs() << "Dest: " << Dest << "\n"); - LLVM_DEBUG(dbgs() << "TrueValue: " << TrueValue << "\n"); - LLVM_DEBUG(dbgs() << "FalseValue: " << FalseValue << "\n"); - LLVM_DEBUG(dbgs() << "ConditionRegister: " << MI->getOperand(3) << "\n"); - - // If the Dest Register and True Value Register are not the same one, we - // need the True Block. 
- bool IsADDIInstRequired = !useSameRegister(Dest, TrueValue); - bool IsORIInstRequired = !useSameRegister(Dest, FalseValue); - - // Copy the result into the destination if the condition is true. - if (IsADDIInstRequired) - BuildMI(*TrueBlock, TrueBlockI, dl, - TII->get(isISEL8(*MI) ? PPC::ADDI8 : PPC::ADDI)) - .add(Dest) - .add(TrueValue) - .add(MachineOperand::CreateImm(0)); - - // Copy the result into the destination if the condition is false. - if (IsORIInstRequired) - BuildMI(*FalseBlock, FalseBlockI, dl, - TII->get(isISEL8(*MI) ? PPC::ORI8 : PPC::ORI)) - .add(Dest) - .add(FalseValue) - .add(MachineOperand::CreateImm(0)); - - MI->eraseFromParent(); // Remove the ISEL instruction. - - NumExpanded++; - } - - if (IsTrueBlockRequired) { - // Update the liveins for TrueBlock. - LivePhysRegs LPR; - computeAndAddLiveIns(LPR, *TrueBlock); - } - - if (IsFalseBlockRequired) { - // Update the liveins for FalseBlock. - LivePhysRegs LPR; - computeAndAddLiveIns(LPR, *FalseBlock); - } -} - -void PPCExpandISEL::expandMergeableISELs(BlockISELList &BIL) { - // At this stage all the ISELs of BIL are in the same MBB. - MachineBasicBlock *MBB = BIL.back()->getParent(); - - handleSpecialCases(BIL, MBB); - reorganizeBlockLayout(BIL, MBB); - populateBlocks(BIL); -} - -INITIALIZE_PASS(PPCExpandISEL, DEBUG_TYPE, "PowerPC Expand ISEL Generation", - false, false) -char PPCExpandISEL::ID = 0; - -FunctionPass *llvm::createPPCExpandISELPass() { return new PPCExpandISEL(); } diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 459a96eca1ff2..efabfa0b511a6 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -6904,7 +6904,7 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, while (NextReg != GPRs.size() && !isGPRShadowAligned(GPRs[NextReg], ObjAlign)) { // Shadow allocate next registers since its aligment is not strict enough. - unsigned Reg = State.AllocateReg(GPRs); + MCRegister Reg = State.AllocateReg(GPRs); // Allocate the stack space shadowed by said register. State.AllocateStack(PtrSize, PtrAlign); assert(Reg && "Alocating register unexpectedly failed."); @@ -6915,7 +6915,7 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, const unsigned StackSize = alignTo(ByValSize, ObjAlign); unsigned Offset = State.AllocateStack(StackSize, ObjAlign); for (const unsigned E = Offset + StackSize; Offset < E; Offset += PtrSize) { - if (unsigned Reg = State.AllocateReg(GPRs)) + if (MCRegister Reg = State.AllocateReg(GPRs)) State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo)); else { State.addLoc(CCValAssign::getMem(ValNo, MVT::INVALID_SIMPLE_VALUE_TYPE, @@ -6942,7 +6942,7 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, if (ValVT.getFixedSizeInBits() < RegVT.getFixedSizeInBits()) LocInfo = ArgFlags.isSExt() ? CCValAssign::LocInfo::SExt : CCValAssign::LocInfo::ZExt; - if (unsigned Reg = State.AllocateReg(GPRs)) + if (MCRegister Reg = State.AllocateReg(GPRs)) State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo)); else State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, RegVT, LocInfo)); @@ -6957,13 +6957,13 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, // This includes f64 in 64-bit mode for ABI compatibility. const unsigned Offset = State.AllocateStack(IsPPC64 ? 
8 : StoreSize, Align(4)); - unsigned FReg = State.AllocateReg(FPR); + MCRegister FReg = State.AllocateReg(FPR); if (FReg) State.addLoc(CCValAssign::getReg(ValNo, ValVT, FReg, LocVT, LocInfo)); // Reserve and initialize GPRs or initialize the PSA as required. for (unsigned I = 0; I < StoreSize; I += PtrSize) { - if (unsigned Reg = State.AllocateReg(GPRs)) { + if (MCRegister Reg = State.AllocateReg(GPRs)) { assert(FReg && "An FPR should be available when a GPR is reserved."); if (State.isVarArg()) { // Successfully reserved GPRs are only initialized for vararg calls. @@ -7003,7 +7003,7 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, if (!State.isVarArg()) { // If there are vector registers remaining we don't consume any stack // space. - if (unsigned VReg = State.AllocateReg(VR)) { + if (MCRegister VReg = State.AllocateReg(VR)) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, VReg, LocVT, LocInfo)); return false; } @@ -7021,7 +7021,7 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, while (NextRegIndex != GPRs.size() && !isGPRShadowAligned(GPRs[NextRegIndex], VecAlign)) { // Shadow allocate register and its stack shadow. - unsigned Reg = State.AllocateReg(GPRs); + MCRegister Reg = State.AllocateReg(GPRs); State.AllocateStack(PtrSize, PtrAlign); assert(Reg && "Allocating register unexpectedly failed."); (void)Reg; @@ -7033,7 +7033,7 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, // through ellipses) and shadow GPRs (unlike arguments to non-vaarg // functions) if (State.isFixed(ValNo)) { - if (unsigned VReg = State.AllocateReg(VR)) { + if (MCRegister VReg = State.AllocateReg(VR)) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, VReg, LocVT, LocInfo)); // Shadow allocate GPRs and stack space even though we pass in a VR. for (unsigned I = 0; I != VecSize; I += PtrSize) @@ -7062,8 +7062,8 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, State.addLoc( CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo)); - const unsigned FirstReg = State.AllocateReg(PPC::R9); - const unsigned SecondReg = State.AllocateReg(PPC::R10); + const MCRegister FirstReg = State.AllocateReg(PPC::R9); + const MCRegister SecondReg = State.AllocateReg(PPC::R10); assert(FirstReg && SecondReg && "Allocating R9 or R10 unexpectedly failed."); State.addLoc( @@ -7080,7 +7080,7 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, State.addLoc( CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo)); for (unsigned I = 0; I != VecSize; I += PtrSize) { - const unsigned Reg = State.AllocateReg(GPRs); + const MCRegister Reg = State.AllocateReg(GPRs); assert(Reg && "Failed to allocated register for vararg vector argument"); State.addLoc( CCValAssign::getCustomReg(ValNo, ValVT, Reg, RegVT, LocInfo)); diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 81f16eb1a905b..48833e8f88066 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1678,7 +1678,8 @@ static unsigned getCRBitValue(unsigned CRBit) { void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, - MCRegister SrcReg, bool KillSrc) const { + MCRegister SrcReg, bool KillSrc, + bool RenamableDest, bool RenamableSrc) const { // We can end up with self copies and similar things as a result of VSX copy // legalization. Promote them here. 
const TargetRegisterInfo *TRI = &getRegisterInfo(); diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h index 1e2687f92c61e..40996f6fbb75e 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -454,7 +454,8 @@ class PPCInstrInfo : public PPCGenInstrInfo { void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index 6ce345dd44138..7d0455942923d 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -141,7 +141,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTarget() { initializePPCBSelPass(PR); initializePPCBranchCoalescingPass(PR); initializePPCBoolRetToIntPass(PR); - initializePPCExpandISELPass(PR); initializePPCPreEmitPeepholePass(PR); initializePPCTLSDynamicCallPass(PR); initializePPCMIPeepholePass(PR); @@ -600,7 +599,6 @@ void PPCPassConfig::addPreSched2() { void PPCPassConfig::addPreEmitPass() { addPass(createPPCPreEmitPeepholePass()); - addPass(createPPCExpandISELPass()); if (getOptLevel() != CodeGenOptLevel::None) addPass(createPPCEarlyReturnPass()); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 670dee2edb1df..790107b772fcb 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1285,8 +1285,14 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, if (VT.getVectorElementType() == MVT::bf16) { setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom); - // FIXME: We should prefer BUILD_VECTOR over SPLAT_VECTOR. - setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + if (Subtarget.hasStdExtZfbfmin()) { + // FIXME: We should prefer BUILD_VECTOR over SPLAT_VECTOR. + setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + } else { + // We need to custom legalize bf16 build vectors if Zfbfmin isn't + // available. + setOperationAction(ISD::BUILD_VECTOR, MVT::bf16, Custom); + } setOperationAction( {ISD::VP_MERGE, ISD::VP_SELECT, ISD::VSELECT, ISD::SELECT}, VT, Custom); @@ -3935,9 +3941,9 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, MVT VT = Op.getSimpleValueType(); assert(VT.isFixedLengthVector() && "Unexpected vector!"); - // If we don't have scalar f16, we need to bitcast to an i16 vector. - if (VT.getVectorElementType() == MVT::f16 && - !Subtarget.hasStdExtZfhmin()) + // If we don't have scalar f16/bf16, we need to bitcast to an i16 vector. 
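The bf16 handling added here reuses the existing f16 fallback: when there is no scalar bf16 support (no Zfbfmin), a fixed-length bf16 build_vector is routed through lowerBUILD_VECTORvXf16, which, per the comment below, works on the elements' 16-bit integer bit patterns instead. The following is only a rough sketch of that idea with an invented helper name, not the actual RISC-V implementation.

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Sketch: without scalar bf16 ops, reinterpret each bf16 element as its
// i16 bit pattern, build an i16 vector, and bitcast the result back to
// the original bf16 vector type.
static SDValue lowerBF16BuildVectorViaI16(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();                // e.g. v4bf16
  MVT IVT = VT.changeVectorElementType(MVT::i16);  // e.g. v4i16
  SmallVector<SDValue, 8> IntElts;
  for (const SDValue &Elt : Op->ops())
    IntElts.push_back(DAG.getBitcast(MVT::i16, Elt)); // bf16 -> i16 bits
  SDValue IntVec = DAG.getBuildVector(IVT, DL, IntElts);
  return DAG.getBitcast(VT, IntVec);               // reinterpret as bf16
}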
+ if ((VT.getVectorElementType() == MVT::f16 && !Subtarget.hasStdExtZfhmin()) || + (VT.getVectorElementType() == MVT::bf16 && !Subtarget.hasStdExtZfbfmin())) return lowerBUILD_VECTORvXf16(Op, DAG); if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) || @@ -6883,10 +6889,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::SMAX: case ISD::UMIN: case ISD::UMAX: - return lowerToScalableOp(Op, DAG); case ISD::UADDSAT: case ISD::USUBSAT: - return lowerToScalableOp(Op, DAG); case ISD::SADDSAT: case ISD::SSUBSAT: return lowerToScalableOp(Op, DAG); @@ -18625,7 +18629,7 @@ bool RISCV::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, // Static chain parameter must not be passed in normal argument registers, // so we assign t2 for it as done in GCC's __builtin_call_with_static_chain if (ArgFlags.isNest()) { - if (unsigned Reg = State.AllocateReg(RISCV::X7)) { + if (MCRegister Reg = State.AllocateReg(RISCV::X7)) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; } @@ -19098,7 +19102,7 @@ bool RISCV::CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, const RISCVTargetLowering &TLI, RVVArgDispatcher &RVVDispatcher) { if (LocVT == MVT::i32 || LocVT == MVT::i64) { - if (unsigned Reg = State.AllocateReg(getFastCCArgGPRs(ABI))) { + if (MCRegister Reg = State.AllocateReg(getFastCCArgGPRs(ABI))) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; } @@ -19106,14 +19110,13 @@ bool RISCV::CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, const RISCVSubtarget &Subtarget = TLI.getSubtarget(); - if (LocVT == MVT::f16 && - (Subtarget.hasStdExtZfh() || Subtarget.hasStdExtZfhmin())) { + if (LocVT == MVT::f16 && Subtarget.hasStdExtZfhmin()) { static const MCPhysReg FPR16List[] = { RISCV::F10_H, RISCV::F11_H, RISCV::F12_H, RISCV::F13_H, RISCV::F14_H, RISCV::F15_H, RISCV::F16_H, RISCV::F17_H, RISCV::F0_H, RISCV::F1_H, RISCV::F2_H, RISCV::F3_H, RISCV::F4_H, RISCV::F5_H, RISCV::F6_H, RISCV::F7_H, RISCV::F28_H, RISCV::F29_H, RISCV::F30_H, RISCV::F31_H}; - if (unsigned Reg = State.AllocateReg(FPR16List)) { + if (MCRegister Reg = State.AllocateReg(FPR16List)) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; } @@ -19125,7 +19128,7 @@ bool RISCV::CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, RISCV::F15_F, RISCV::F16_F, RISCV::F17_F, RISCV::F0_F, RISCV::F1_F, RISCV::F2_F, RISCV::F3_F, RISCV::F4_F, RISCV::F5_F, RISCV::F6_F, RISCV::F7_F, RISCV::F28_F, RISCV::F29_F, RISCV::F30_F, RISCV::F31_F}; - if (unsigned Reg = State.AllocateReg(FPR32List)) { + if (MCRegister Reg = State.AllocateReg(FPR32List)) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; } @@ -19137,19 +19140,20 @@ bool RISCV::CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, RISCV::F15_D, RISCV::F16_D, RISCV::F17_D, RISCV::F0_D, RISCV::F1_D, RISCV::F2_D, RISCV::F3_D, RISCV::F4_D, RISCV::F5_D, RISCV::F6_D, RISCV::F7_D, RISCV::F28_D, RISCV::F29_D, RISCV::F30_D, RISCV::F31_D}; - if (unsigned Reg = State.AllocateReg(FPR64List)) { + if (MCRegister Reg = State.AllocateReg(FPR64List)) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; } } // Check if there is an available GPR before hitting the stack. 
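Dropping the explicit `hasStdExtZfh()` check works only if enabling Zfh also implies the Zfhmin feature, so that querying the weaker feature covers both. A toy sketch of such a feature-implication query, with made-up names standing in for the subtarget feature graph:

#include <cstdio>
#include <set>
#include <string>

// Enabling a feature also enables whatever it implies, so checking the
// weaker feature is sufficient (names are labels, not a real ISA model).
struct ToySubtarget {
  std::set<std::string> Features;
  void enable(const std::string &F) {
    Features.insert(F);
    if (F == "zfh")
      Features.insert("zfhmin"); // zfh provides everything zfhmin does
  }
  bool has(const std::string &F) const { return Features.count(F) != 0; }
};

int main() {
  ToySubtarget ST;
  ST.enable("zfh");
  std::printf("hasZfhmin = %d\n", (int)ST.has("zfhmin")); // prints 1
}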
- if ((LocVT == MVT::f16 && - (Subtarget.hasStdExtZhinx() || Subtarget.hasStdExtZhinxmin())) || + if ((LocVT == MVT::f16 && Subtarget.hasStdExtZhinxmin()) || (LocVT == MVT::f32 && Subtarget.hasStdExtZfinx()) || (LocVT == MVT::f64 && Subtarget.is64Bit() && Subtarget.hasStdExtZdinx())) { - if (unsigned Reg = State.AllocateReg(getFastCCArgGPRs(ABI))) { + if (MCRegister Reg = State.AllocateReg(getFastCCArgGPRs(ABI))) { + LocInfo = CCValAssign::BCvt; + LocVT = Subtarget.getXLenVT(); State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; } @@ -19184,7 +19188,7 @@ bool RISCV::CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, CCValAssign::getReg(ValNo, ValVT, AllocatedVReg, LocVT, LocInfo)); } else { // Try and pass the address via a "fast" GPR. - if (unsigned GPRReg = State.AllocateReg(getFastCCArgGPRs(ABI))) { + if (MCRegister GPRReg = State.AllocateReg(getFastCCArgGPRs(ABI))) { LocInfo = CCValAssign::Indirect; LocVT = TLI.getSubtarget().getXLenVT(); State.addLoc(CCValAssign::getReg(ValNo, ValVT, GPRReg, LocVT, LocInfo)); @@ -19222,7 +19226,7 @@ bool RISCV::CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, if (LocVT == MVT::i32 || LocVT == MVT::i64) { // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, R7, SpLim // s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 - if (unsigned Reg = State.AllocateReg(GPRList)) { + if (MCRegister Reg = State.AllocateReg(GPRList)) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; } @@ -19237,7 +19241,7 @@ bool RISCV::CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, static const MCPhysReg FPR32List[] = {RISCV::F8_F, RISCV::F9_F, RISCV::F18_F, RISCV::F19_F, RISCV::F20_F, RISCV::F21_F}; - if (unsigned Reg = State.AllocateReg(FPR32List)) { + if (MCRegister Reg = State.AllocateReg(FPR32List)) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; } @@ -19249,7 +19253,7 @@ bool RISCV::CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, static const MCPhysReg FPR64List[] = {RISCV::F22_D, RISCV::F23_D, RISCV::F24_D, RISCV::F25_D, RISCV::F26_D, RISCV::F27_D}; - if (unsigned Reg = State.AllocateReg(FPR64List)) { + if (MCRegister Reg = State.AllocateReg(FPR64List)) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; } @@ -19258,7 +19262,7 @@ bool RISCV::CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, if ((LocVT == MVT::f32 && Subtarget.hasStdExtZfinx()) || (LocVT == MVT::f64 && Subtarget.hasStdExtZdinx() && Subtarget.is64Bit())) { - if (unsigned Reg = State.AllocateReg(GPRList)) { + if (MCRegister Reg = State.AllocateReg(GPRList)) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 9dd79027d7a16..77072edab4d13 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -440,12 +440,14 @@ void RISCVInstrInfo::copyPhysRegVector( void RISCVInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, MCRegister DstReg, - MCRegister SrcReg, bool KillSrc) const { + MCRegister SrcReg, bool KillSrc, + bool RenamableDest, bool RenamableSrc) const { const TargetRegisterInfo *TRI = STI.getRegisterInfo(); if (RISCV::GPRRegClass.contains(DstReg, SrcReg)) { BuildMI(MBB, MBBI, DL, get(RISCV::ADDI), DstReg) - .addReg(SrcReg, getKillRegState(KillSrc)) + .addReg(SrcReg, + getKillRegState(KillSrc) | getRenamableRegState(RenamableSrc)) 
.addImm(0); return; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index ecb7982b3e5e3..f25e5ee42a737 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -84,7 +84,8 @@ class RISCVInstrInfo : public RISCVGenInstrInfo { const TargetRegisterClass *RegClass) const; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, MCRegister DstReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 1af873f85d03c..5ec07b2a0aa8f 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -447,21 +447,30 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF); InstructionCost Cost = 0; - for (unsigned I = 0; I < NumRegs; ++I) { + for (unsigned I = 0, NumSrcRegs = divideCeil(Mask.size(), SubVF); + I < NumSrcRegs; ++I) { bool IsSingleVector = true; SmallVector SubMask(SubVF, PoisonMaskElem); - transform(Mask.slice(I * SubVF, - I == NumRegs - 1 ? Mask.size() % SubVF : SubVF), - SubMask.begin(), [&](int I) { - bool SingleSubVector = I / VF == 0; - IsSingleVector &= SingleSubVector; - return (SingleSubVector ? 0 : 1) * SubVF + I % VF; - }); + transform( + Mask.slice(I * SubVF, + I == NumSrcRegs - 1 ? Mask.size() % SubVF : SubVF), + SubMask.begin(), [&](int I) -> int { + if (I == PoisonMaskElem) + return PoisonMaskElem; + bool SingleSubVector = I / VF == 0; + IsSingleVector &= SingleSubVector; + return (SingleSubVector ? 0 : 1) * SubVF + (I % VF) % SubVF; + }); + if (all_of(enumerate(SubMask), [](auto &&P) { + return P.value() == PoisonMaskElem || + static_cast(P.value()) == P.index(); + })) + continue; Cost += getShuffleCost(IsSingleVector ? TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc, SubVecTy, SubMask, CostKind, 0, nullptr); - return Cost; } + return Cost; } break; } diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index 822ab492c710b..34e5d9224f715 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -131,6 +131,9 @@ bool RISCVVectorPeephole::tryToReduceVL(MachineInstr &MI) const { case RISCV::VMV_V_V: SrcIdx = 2; break; + case RISCV::VMERGE_VVM: + SrcIdx = 3; // TODO: We can also handle the false operand. 
+ break; } MachineOperand &VL = MI.getOperand(RISCVII::getVLOpNum(MI.getDesc())); diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index 5c057a79afa0c..dd97263316505 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -1500,7 +1500,7 @@ foreach i = ["", "2", "3", "4", "8", "16"] in { defm : DemangledVectorLoadStoreBuiltin<"vload_half", 2, 2, 173>; defm : DemangledVectorLoadStoreBuiltin<"vstore_half", 3, 3, 175>; } else { - defm : DemangledVectorLoadStoreBuiltin; + defm : DemangledVectorLoadStoreBuiltin; defm : DemangledVectorLoadStoreBuiltin; } defm : DemangledVectorLoadStoreBuiltin; diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index d9864ab50ecfe..4175f766ac69a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -24,6 +24,7 @@ #include "llvm/IR/TypedPointerType.h" #include +#include // This pass performs the following transformation on LLVM IR level required // for the following translation to SPIR-V: diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp index dac7640cdddd6..e8a6b4fdbae97 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp @@ -241,7 +241,8 @@ unsigned SPIRVInstrInfo::insertBranch( void SPIRVInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, - MCRegister SrcReg, bool KillSrc) const { + MCRegister SrcReg, bool KillSrc, + bool RenamableDest, bool RenamableSrc) const { // Actually we don't need this COPY instruction. However if we do nothing with // it, post RA pseudo instrs expansion just removes it and we get the code // with undef registers. 
Therefore, we need to replace all uses of dst with diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h index 95f3874913572..67d2d979cb5a1 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h @@ -51,7 +51,8 @@ class SPIRVInstrInfo : public SPIRVGenInstrInfo { int *BytesAdded = nullptr) const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; }; diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp index 2727a9f2efbb1..0bb2540a97d72 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp @@ -438,7 +438,8 @@ bool SparcInstrInfo::isBranchOffsetInRange(unsigned BranchOpc, void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, - MCRegister SrcReg, bool KillSrc) const { + MCRegister SrcReg, bool KillSrc, + bool RenamableDest, bool RenamableSrc) const { unsigned numSubRegs = 0; unsigned movOpc = 0; const unsigned *subRegIdx = nullptr; diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.h b/llvm/lib/Target/Sparc/SparcInstrInfo.h index a7bb34c6c8e77..fc04542c819d4 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.h +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.h @@ -87,7 +87,8 @@ class SparcInstrInfo : public SparcGenInstrInfo { void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp index d0758891fe570..91db858f5cdaf 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -857,7 +857,9 @@ bool SystemZInstrInfo::PredicateInstruction( void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, MCRegister DestReg, - MCRegister SrcReg, bool KillSrc) const { + MCRegister SrcReg, bool KillSrc, + bool RenamableDest, + bool RenamableSrc) const { // Split 128-bit GPR moves into two 64-bit moves. Add implicit uses of the // super register in case one of the subregs is undefined. // This handles ADDR128 too. 
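The common thread in these target changes is that `copyPhysReg` grows two defaulted boolean parameters: the out-of-line definitions spell them out, the header overrides default them, and existing call sites that never pass the new flags keep compiling. A standalone sketch of that pattern with a made-up `TargetCopyEmitter` interface (not the real TargetInstrInfo):

#include <cstdio>

// Base hook gains two defaulted flags; existing callers stay unchanged.
struct TargetCopyEmitter {
  virtual ~TargetCopyEmitter() = default;
  virtual void copyPhysReg(unsigned DestReg, unsigned SrcReg, bool KillSrc,
                           bool RenamableDest = false,
                           bool RenamableSrc = false) const = 0;
};

// Each target override must repeat the new parameters; default arguments are
// taken from the static type a call is made through (here, the base class).
struct ToyTarget : TargetCopyEmitter {
  void copyPhysReg(unsigned DestReg, unsigned SrcReg, bool KillSrc,
                   bool RenamableDest = false,
                   bool RenamableSrc = false) const override {
    std::printf("copy r%u <- r%u kill=%d renamable(dst=%d,src=%d)\n", DestReg,
                SrcReg, (int)KillSrc, (int)RenamableDest, (int)RenamableSrc);
  }
};

int main() {
  ToyTarget TII;
  const TargetCopyEmitter &Base = TII;
  Base.copyPhysReg(1, 2, /*KillSrc=*/true); // old-style call still compiles
  Base.copyPhysReg(3, 4, false, /*RenamableDest=*/true, /*RenamableSrc=*/true);
}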
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h index 61338b0816155..cc8a4ccd234cd 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h @@ -276,7 +276,8 @@ class SystemZInstrInfo : public SystemZGenInstrInfo { ArrayRef Pred) const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp index c001dc4d92b9a..fccbed3bdec8b 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.cpp +++ b/llvm/lib/Target/VE/VEInstrInfo.cpp @@ -359,7 +359,8 @@ static void copyPhysSubRegs(MachineBasicBlock &MBB, void VEInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const { + bool KillSrc, bool RenamableDest, + bool RenamableSrc) const { if (IsAliasOfSX(SrcReg) && IsAliasOfSX(DestReg)) { BuildMI(MBB, I, DL, get(VE::ORri), DestReg) diff --git a/llvm/lib/Target/VE/VEInstrInfo.h b/llvm/lib/Target/VE/VEInstrInfo.h index 4fcc479a13d57..3a9718f2f2603 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.h +++ b/llvm/lib/Target/VE/VEInstrInfo.h @@ -81,7 +81,8 @@ class VEInstrInfo : public VEGenInstrInfo { void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; /// Stack Spill & Reload { Register isLoadFromStackSlot(const MachineInstr &MI, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp index 32a4accd040eb..75011ab3c8721 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp @@ -57,7 +57,9 @@ bool WebAssemblyInstrInfo::isReallyTriviallyReMaterializable( void WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, - MCRegister SrcReg, bool KillSrc) const { + MCRegister SrcReg, bool KillSrc, + bool RenamableDest, + bool RenamableSrc) const { // This method is called by post-RA expansion, which expects only pregs to // exist. However we need to handle both here. 
auto &MRI = MBB.getParent()->getRegInfo(); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h index c1e1a790c60e2..8cb692f9bc0c4 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h @@ -47,7 +47,8 @@ class WebAssemblyInstrInfo final : public WebAssemblyGenInstrInfo { void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const override; diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index eda3c9fd50bf5..864b7d8e769ab 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -1956,7 +1956,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { if (DotOffset != StringRef::npos) { consumeToken(); StringRef LHS = Identifier.slice(0, DotOffset); - StringRef Dot = Identifier.slice(DotOffset, DotOffset + 1); + StringRef Dot = Identifier.substr(DotOffset, 1); StringRef RHS = Identifier.substr(DotOffset + 1); if (!RHS.empty()) { getLexer().UnLex(AsmToken(AsmToken::Identifier, RHS)); diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp index 0ea51bec29b81..154cb1399880b 100644 --- a/llvm/lib/Target/X86/X86CallingConv.cpp +++ b/llvm/lib/Target/X86/X86CallingConv.cpp @@ -51,7 +51,7 @@ static bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, for (unsigned I = 0; I < RequiredGprsUponSplit; I++) { // Marking the register as located. - unsigned Reg = State.AllocateReg(AvailableRegs[I]); + MCRegister Reg = State.AllocateReg(AvailableRegs[I]); // Since we previously made sure that 2 registers are available // we expect that a real register number will be returned. @@ -102,7 +102,7 @@ static bool CC_X86_VectorCallAssignRegister(unsigned &ValNo, MVT &ValVT, for (auto Reg : RegList) { // If the register is not marked as allocated - assign to it. if (!State.isAllocated(Reg)) { - unsigned AssigedReg = State.AllocateReg(Reg); + MCRegister AssigedReg = State.AllocateReg(Reg); assert(AssigedReg == Reg && "Expecting a valid register allocation"); State.addLoc( CCValAssign::getReg(ValNo, ValVT, AssigedReg, LocVT, LocInfo)); @@ -158,7 +158,7 @@ static bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, (void)State.AllocateReg(CC_X86_64_VectorCallGetGPRs()); // Assign XMM register - (shadow for HVA and non-shadow for non HVA). - if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) { + if (MCRegister Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) { // In Vectorcall Calling convention, additional shadow stack can be // created on top of the basic 32 bytes of win64. // It can happen if the fifth or sixth argument is vector type or HVA. @@ -209,7 +209,7 @@ static bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, return true; // If this is an HVA - Stop the search. // Assign XMM register. 
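The X86AsmParser change above swaps a half-open `slice(DotOffset, DotOffset + 1)` for `substr(DotOffset, 1)`; both extract the single '.' character. A standalone check of that equivalence, with a small `slice` helper and std::string_view standing in for StringRef:

#include <cstdio>
#include <string_view>

// Stand-in for a half-open [Start, End) slice.
static std::string_view slice(std::string_view S, size_t Start, size_t End) {
  return S.substr(Start, End - Start);
}

int main() {
  std::string_view Identifier = "struct.field";
  size_t DotOffset = Identifier.find('.');

  std::string_view Old = slice(Identifier, DotOffset, DotOffset + 1);
  std::string_view New = Identifier.substr(DotOffset, 1);
  std::printf("old='%.*s' new='%.*s' equal=%d\n", (int)Old.size(), Old.data(),
              (int)New.size(), New.data(), (int)(Old == New));
}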
- if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) { + if (MCRegister Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return true; } @@ -259,7 +259,7 @@ static bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT, MVT &LocVT, // If there are no pending members, we are not in the middle of a split, // so do the usual inreg stuff. if (PendingMembers.empty()) { - if (unsigned Reg = State.AllocateReg(RegList)) { + if (MCRegister Reg = State.AllocateReg(RegList)) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return true; } diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 39ba7ea777909..30428b9c3dcdd 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -4293,7 +4293,8 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, - MCRegister SrcReg, bool KillSrc) const { + MCRegister SrcReg, bool KillSrc, + bool RenamableDest, bool RenamableSrc) const { // First deal with the normal symmetric copies. bool HasAVX = Subtarget.hasAVX(); bool HasVLX = Subtarget.hasVLX(); diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 3100a9e5699f0..1c6362f911e4a 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -418,7 +418,8 @@ class X86InstrInfo final : public X86GenInstrInfo { Register FalseReg) const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp index ae2e0fec3f899..90a195e928a59 100644 --- a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp +++ b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp @@ -331,7 +331,8 @@ XCoreInstrInfo::removeBranch(MachineBasicBlock &MBB, int *BytesRemoved) const { void XCoreInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, - MCRegister SrcReg, bool KillSrc) const { + MCRegister SrcReg, bool KillSrc, + bool RenamableDest, bool RenamableSrc) const { bool GRDest = XCore::GRRegsRegClass.contains(DestReg); bool GRSrc = XCore::GRRegsRegClass.contains(SrcReg); diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.h b/llvm/lib/Target/XCore/XCoreInstrInfo.h index 1dafb6ea7d211..7f330539dd76a 100644 --- a/llvm/lib/Target/XCore/XCoreInstrInfo.h +++ b/llvm/lib/Target/XCore/XCoreInstrInfo.h @@ -64,7 +64,8 @@ class XCoreInstrInfo : public XCoreGenInstrInfo { void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp index c7675c2f50176..0d2ce26a942e0 100644 --- 
a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp +++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp @@ -70,11 +70,12 @@ XtensaTargetLowering::XtensaTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand); - // No sign extend instructions for i1 + // No sign extend instructions for i1 and sign extend load i8 for (MVT VT : MVT::integer_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand); } setOperationAction(ISD::ConstantPool, PtrVT, Custom); diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp index 491defb867643..2263aadcb0dd3 100644 --- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp +++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp @@ -105,7 +105,8 @@ void XtensaInstrInfo::adjustStackPtr(unsigned SP, int64_t Amount, void XtensaInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, MCRegister DestReg, - MCRegister SrcReg, bool KillSrc) const { + MCRegister SrcReg, bool KillSrc, + bool RenamableDest, bool RenamableSrc) const { // The MOV instruction is not present in core ISA, // so use OR instruction. if (Xtensa::ARRegClass.contains(DestReg, SrcReg)) diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.h b/llvm/lib/Target/Xtensa/XtensaInstrInfo.h index 37f157f832464..8bf3f0618f285 100644 --- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.h +++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.h @@ -49,7 +49,8 @@ class XtensaInstrInfo : public XtensaGenInstrInfo { void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, - bool KillSrc) const override; + bool KillSrc, bool RenamableDest = false, + bool RenamableSrc = false) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index 74a71cbf101b5..dd01d143b066b 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -352,6 +352,13 @@ void FunctionImporter::ImportMapTy::maybeAddDeclaration( ImportMap[FromModule].try_emplace(GUID, GlobalValueSummary::Declaration); } +SmallVector +FunctionImporter::ImportMapTy::getSourceModules() const { + SmallVector Modules(make_first_range(ImportMap)); + llvm::sort(Modules); + return Modules; +} + /// Import globals referenced by a function or other globals that are being /// imported, if importing such global is possible. 
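The new `getSourceModules()` above copies the import map's keys and sorts them, giving the import loop a deterministic module order without a separate ordered container. A standalone sketch of that idea using std::unordered_map and std::sort (toy key/value types, not the importer's real containers):

#include <algorithm>
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  // Import list keyed by source module; unordered_map iteration order is
  // unspecified, so the keys are copied out and sorted before use.
  std::unordered_map<std::string, int> ImportMap = {
      {"libB.bc", 3}, {"libA.bc", 5}, {"libC.bc", 1}};

  std::vector<std::string> Modules;
  Modules.reserve(ImportMap.size());
  for (const auto &KV : ImportMap)
    Modules.push_back(KV.first);
  std::sort(Modules.begin(), Modules.end());

  for (const std::string &Name : Modules) // libA.bc, libB.bc, libC.bc
    std::printf("importing from %s\n", Name.c_str());
}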
class GlobalsImporter final { @@ -1770,11 +1777,6 @@ Expected FunctionImporter::importFunctions( unsigned ImportedCount = 0, ImportedGVCount = 0; IRMover Mover(DestModule); - // Do the actual import of functions now, one Module at a time - std::set ModuleNameOrderedList; - for (const auto &FunctionsToImportPerModule : ImportList.getImportMap()) { - ModuleNameOrderedList.insert(FunctionsToImportPerModule.first); - } auto getImportType = [&](const FunctionsToImportTy &GUIDToImportType, GlobalValue::GUID GUID) @@ -1785,7 +1787,8 @@ Expected FunctionImporter::importFunctions( return Iter->second; }; - for (const auto &Name : ModuleNameOrderedList) { + // Do the actual import of functions now, one Module at a time + for (const auto &Name : ImportList.getSourceModules()) { // Get the module for the import const auto &FunctionsToImportPerModule = ImportList.getImportMap().find(Name); diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index aacfe39f16fbc..8dd0cfdb2ae0a 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -1658,11 +1658,17 @@ void PGOUseFunc::setBranchWeights() { continue; // We have a non-zero Branch BB. - unsigned Size = BBCountInfo.OutEdges.size(); - SmallVector EdgeCounts(Size, 0); + + // SuccessorCount can be greater than OutEdgesCount, because + // removed edges don't appear in OutEdges. + unsigned OutEdgesCount = BBCountInfo.OutEdges.size(); + unsigned SuccessorCount = BB.getTerminator()->getNumSuccessors(); + assert(OutEdgesCount <= SuccessorCount); + + SmallVector EdgeCounts(SuccessorCount, 0); uint64_t MaxCount = 0; - for (unsigned s = 0; s < Size; s++) { - const PGOUseEdge *E = BBCountInfo.OutEdges[s]; + for (unsigned It = 0; It < OutEdgesCount; It++) { + const PGOUseEdge *E = BBCountInfo.OutEdges[It]; const BasicBlock *SrcBB = E->SrcBB; const BasicBlock *DestBB = E->DestBB; if (DestBB == nullptr) diff --git a/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp index 7854cf4f2c625..1707ec298d6d2 100644 --- a/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp @@ -19,6 +19,8 @@ #include "llvm/Transforms/Instrumentation/RealtimeSanitizer.h" +#include + using namespace llvm; static void insertCallBeforeInstruction(Function &Fn, Instruction &Instruction, @@ -51,6 +53,7 @@ RealtimeSanitizerPass::RealtimeSanitizerPass( PreservedAnalyses RealtimeSanitizerPass::run(Function &F, AnalysisManager &AM) { if (F.hasFnAttribute(Attribute::SanitizeRealtime)) { + assert(!F.hasFnAttribute(Attribute::NoSanitizeRealtime)); insertCallAtFunctionEntryPoint(F, "__rtsan_realtime_enter"); insertCallAtAllFunctionExitPoints(F, "__rtsan_realtime_exit"); @@ -59,5 +62,15 @@ PreservedAnalyses RealtimeSanitizerPass::run(Function &F, return PA; } + if (F.hasFnAttribute(Attribute::NoSanitizeRealtime)) { + assert(!F.hasFnAttribute(Attribute::SanitizeRealtime)); + insertCallAtFunctionEntryPoint(F, "__rtsan_off"); + insertCallAtAllFunctionExitPoints(F, "__rtsan_on"); + + PreservedAnalyses PA; + PA.preserveSet(); + return PA; + } + return PreservedAnalyses::all(); } diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index 2b99e28acb4e9..ef6bbd37295ca 100644 --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ 
b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -1654,7 +1654,7 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */ // be replacing a terminator. IRBuilder<> Builder(Call); - ArrayRef GCArgs(LiveVariables); + ArrayRef GCLive(LiveVariables); uint64_t StatepointID = StatepointDirectives::DefaultStatepointID; uint32_t NumPatchBytes = 0; uint32_t Flags = uint32_t(StatepointFlags::None); @@ -1827,7 +1827,7 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */ if (auto *CI = dyn_cast(Call)) { CallInst *SPCall = Builder.CreateGCStatepointCall( StatepointID, NumPatchBytes, CallTarget, Flags, CallArgs, - TransitionArgs, DeoptArgs, GCArgs, "safepoint_token"); + TransitionArgs, DeoptArgs, GCLive, "safepoint_token"); SPCall->setTailCallKind(CI->getTailCallKind()); SPCall->setCallingConv(CI->getCallingConv()); @@ -1852,8 +1852,8 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */ // original block. InvokeInst *SPInvoke = Builder.CreateGCStatepointInvoke( StatepointID, NumPatchBytes, CallTarget, II->getNormalDest(), - II->getUnwindDest(), Flags, CallArgs, TransitionArgs, DeoptArgs, GCArgs, - "statepoint_token"); + II->getUnwindDest(), Flags, CallArgs, TransitionArgs, DeoptArgs, + GCLive, "statepoint_token"); SPInvoke->setCallingConv(II->getCallingConv()); @@ -2839,7 +2839,7 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, // That Value* no longer exists and we need to use the new gc_result. // Thankfully, the live set is embedded in the statepoint (and updated), so // we just grab that. - llvm::append_range(Live, Info.StatepointToken->gc_args()); + llvm::append_range(Live, Info.StatepointToken->gc_live()); #ifndef NDEBUG // Do some basic validation checking on our liveness results before // performing relocation. Relocation can and will turn mistakes in liveness @@ -2847,7 +2847,7 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, // TODO: It would be nice to test consistency as well assert(DT.isReachableFromEntry(Info.StatepointToken->getParent()) && "statepoint must be reachable or liveness is meaningless"); - for (Value *V : Info.StatepointToken->gc_args()) { + for (Value *V : Info.StatepointToken->gc_live()) { if (!isa(V)) // Non-instruction values trivial dominate all possible uses continue; diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 81d3243c887fc..cf00299812bb7 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -632,14 +632,17 @@ bool CodeExtractor::isEligible() const { } void CodeExtractor::findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, - const ValueSet &SinkCands) const { + const ValueSet &SinkCands, + bool CollectGlobalInputs) const { for (BasicBlock *BB : Blocks) { // If a used value is defined outside the region, it's an input. If an // instruction is used outside the region, it's an output. 
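The `findInputsOutputs` change above adds a `CollectGlobalInputs` mode: values defined outside the region are inputs, values used after the region are outputs, and with the flag set, globals referenced inside the region are collected as inputs as well. A toy sketch of that classification, with string names standing in for Values and hand-written sets standing in for the region analysis:

#include <cstdio>
#include <set>
#include <string>
#include <vector>

struct ToyInst {
  std::string Def;               // value this instruction defines
  std::vector<std::string> Uses; // values it reads
};

int main() {
  // Region = both instructions; %a comes from the caller, %g is a global,
  // and %t is read again after the region.
  std::vector<ToyInst> Region = {{"%t", {"%a", "%g"}}, {"%u", {"%t"}}};
  std::set<std::string> DefinedInRegion = {"%t", "%u"};
  std::set<std::string> Globals = {"%g"};
  std::set<std::string> UsedAfterRegion = {"%t"};
  bool CollectGlobalInputs = true;

  std::set<std::string> Inputs, Outputs;
  for (const ToyInst &I : Region) {
    for (const std::string &V : I.Uses)
      if (!DefinedInRegion.count(V) ||
          (CollectGlobalInputs && Globals.count(V)))
        Inputs.insert(V); // defined outside the region (or a global) -> input
    if (UsedAfterRegion.count(I.Def))
      Outputs.insert(I.Def); // defined inside but used outside -> output
  }
  for (const std::string &V : Inputs)
    std::printf("input  %s\n", V.c_str());
  for (const std::string &V : Outputs)
    std::printf("output %s\n", V.c_str());
}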
for (Instruction &II : *BB) { for (auto &OI : II.operands()) { Value *V = OI; - if (!SinkCands.count(V) && definedInCaller(Blocks, V)) + if (!SinkCands.count(V) && + (definedInCaller(Blocks, V) || + (CollectGlobalInputs && llvm::isa(V)))) Inputs.insert(V); } @@ -934,6 +937,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, case Attribute::NoUnwind: case Attribute::NoSanitizeBounds: case Attribute::NoSanitizeCoverage: + case Attribute::NoSanitizeRealtime: case Attribute::NullPointerIsValid: case Attribute::OptimizeForDebugging: case Attribute::OptForFuzzing: diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index 670d88ac7cf8f..982b1041c7c51 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -446,6 +446,12 @@ class SCCPInstVisitor : public InstVisitor { return markConstant(ValueState[V], V, C); } + bool markNotConstant(ValueLatticeElement &IV, Value *V, Constant *C); + + bool markNotNull(ValueLatticeElement &IV, Value *V) { + return markNotConstant(IV, V, Constant::getNullValue(V->getType())); + } + /// markConstantRange - Mark the object as constant range with \p CR. If the /// object is not a constant range with the range \p CR, add it to the /// instruction work list so that the users of the instruction are updated @@ -667,6 +673,7 @@ class SCCPInstVisitor : public InstVisitor { void visitStoreInst(StoreInst &I); void visitLoadInst(LoadInst &I); void visitGetElementPtrInst(GetElementPtrInst &I); + void visitAllocaInst(AllocaInst &AI); void visitInvokeInst(InvokeInst &II) { visitCallBase(II); @@ -820,7 +827,11 @@ class SCCPInstVisitor : public InstVisitor { return; } } - // Assume nothing about the incoming arguments without range. + if (A->hasNonNullAttr()) { + markNotNull(ValueState[A], A); + return; + } + // Assume nothing about the incoming arguments without attributes. markOverdefined(A); } @@ -905,6 +916,15 @@ bool SCCPInstVisitor::markConstant(ValueLatticeElement &IV, Value *V, return true; } +bool SCCPInstVisitor::markNotConstant(ValueLatticeElement &IV, Value *V, + Constant *C) { + if (!IV.markNotConstant(C)) + return false; + LLVM_DEBUG(dbgs() << "markNotConstant: " << *C << ": " << *V << '\n'); + pushToWorkList(IV, V); + return true; +} + bool SCCPInstVisitor::markConstantRange(ValueLatticeElement &IV, Value *V, const ConstantRange &CR) { if (!IV.markConstantRange(CR)) @@ -1364,7 +1384,7 @@ void SCCPInstVisitor::visitInsertValueInst(InsertValueInst &IVI) { // resolvedUndefsIn might mark I as overdefined. Bail out, even if we would // discover a concrete value later. - if (SCCPSolver::isOverdefined(ValueState[&IVI])) + if (ValueState[&IVI].isOverdefined()) return (void)markOverdefined(&IVI); // If this has more than one index, we can't handle it, drive all results to @@ -1436,7 +1456,7 @@ void SCCPInstVisitor::visitUnaryOperator(Instruction &I) { ValueLatticeElement &IV = ValueState[&I]; // resolvedUndefsIn might mark I as overdefined. Bail out, even if we would // discover a concrete value later. - if (SCCPSolver::isOverdefined(IV)) + if (IV.isOverdefined()) return (void)markOverdefined(&I); // If something is unknown/undef, wait for it to resolve. @@ -1461,7 +1481,7 @@ void SCCPInstVisitor::visitFreezeInst(FreezeInst &I) { ValueLatticeElement &IV = ValueState[&I]; // resolvedUndefsIn might mark I as overdefined. Bail out, even if we would // discover a concrete value later. 
- if (SCCPSolver::isOverdefined(IV)) + if (IV.isOverdefined()) return (void)markOverdefined(&I); // If something is unknown/undef, wait for it to resolve. @@ -1541,7 +1561,7 @@ void SCCPInstVisitor::visitBinaryOperator(Instruction &I) { void SCCPInstVisitor::visitCmpInst(CmpInst &I) { // Do not cache this lookup, getValueState calls later in the function might // invalidate the reference. - if (SCCPSolver::isOverdefined(ValueState[&I])) + if (ValueState[&I].isOverdefined()) return (void)markOverdefined(&I); Value *Op1 = I.getOperand(0); @@ -1571,9 +1591,22 @@ void SCCPInstVisitor::visitCmpInst(CmpInst &I) { // Handle getelementptr instructions. If all operands are constants then we // can turn this into a getelementptr ConstantExpr. void SCCPInstVisitor::visitGetElementPtrInst(GetElementPtrInst &I) { - if (SCCPSolver::isOverdefined(ValueState[&I])) + if (ValueState[&I].isOverdefined()) return (void)markOverdefined(&I); + const ValueLatticeElement &PtrState = getValueState(I.getPointerOperand()); + if (PtrState.isUnknownOrUndef()) + return; + + // gep inbounds/nuw of non-null is non-null. + if (PtrState.isNotConstant() && PtrState.getNotConstant()->isNullValue()) { + if (I.hasNoUnsignedWrap() || + (I.isInBounds() && + !NullPointerIsDefined(I.getFunction(), I.getAddressSpace()))) + return (void)markNotNull(ValueState[&I], &I); + return (void)markOverdefined(&I); + } + SmallVector Operands; Operands.reserve(I.getNumOperands()); @@ -1582,9 +1615,6 @@ void SCCPInstVisitor::visitGetElementPtrInst(GetElementPtrInst &I) { if (State.isUnknownOrUndef()) return; // Operands are not resolved yet. - if (SCCPSolver::isOverdefined(State)) - return (void)markOverdefined(&I); - if (Constant *C = getConstant(State, I.getOperand(i)->getType())) { Operands.push_back(C); continue; @@ -1597,6 +1627,13 @@ void SCCPInstVisitor::visitGetElementPtrInst(GetElementPtrInst &I) { markConstant(&I, C); } +void SCCPInstVisitor::visitAllocaInst(AllocaInst &I) { + if (!NullPointerIsDefined(I.getFunction(), I.getAddressSpace())) + return (void)markNotNull(ValueState[&I], &I); + + markOverdefined(&I); +} + void SCCPInstVisitor::visitStoreInst(StoreInst &SI) { // If this store is of a struct, ignore it. 
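The SCCP additions above start tracking pointers that are known not to be null: an alloca is non-null whenever the null address is not a defined location, and a nuw GEP, or an inbounds GEP with null undefined, of a non-null base stays non-null. A standalone sketch of that three-state mini-lattice and its transfer functions, independent of the real ValueLatticeElement:

#include <cstdio>

// Tiny pointer lattice: Unknown, then NotNull, then Overdefined.
enum class PtrState { Unknown, NotNull, Overdefined };

static const char *name(PtrState S) {
  switch (S) {
  case PtrState::Unknown:
    return "unknown";
  case PtrState::NotNull:
    return "not-null";
  case PtrState::Overdefined:
    return "overdefined";
  }
  return "?";
}

// alloca: non-null whenever the null address is not a defined location.
static PtrState visitAlloca(bool NullPointerIsDefined) {
  return NullPointerIsDefined ? PtrState::Overdefined : PtrState::NotNull;
}

// gep: nuw, or inbounds with null undefined, preserves non-nullness of the
// base; otherwise the result degrades to overdefined.
static PtrState visitGep(PtrState Base, bool InBounds, bool NUW,
                         bool NullPointerIsDefined) {
  if (Base == PtrState::Unknown)
    return PtrState::Unknown; // operands not resolved yet
  if (Base == PtrState::NotNull &&
      (NUW || (InBounds && !NullPointerIsDefined)))
    return PtrState::NotNull;
  return PtrState::Overdefined;
}

int main() {
  PtrState A = visitAlloca(/*NullPointerIsDefined=*/false);
  PtrState G = visitGep(A, /*InBounds=*/true, /*NUW=*/false,
                        /*NullPointerIsDefined=*/false);
  std::printf("alloca: %s, gep: %s\n", name(A), name(G));
}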
if (SI.getOperand(0)->getType()->isStructTy()) @@ -1618,18 +1655,23 @@ void SCCPInstVisitor::visitStoreInst(StoreInst &SI) { } static ValueLatticeElement getValueFromMetadata(const Instruction *I) { - if (I->getType()->isIntOrIntVectorTy()) { - if (MDNode *Ranges = I->getMetadata(LLVMContext::MD_range)) - return ValueLatticeElement::getRange( - getConstantRangeFromMetadata(*Ranges)); - - if (const auto *CB = dyn_cast(I)) + if (const auto *CB = dyn_cast(I)) { + if (CB->getType()->isIntOrIntVectorTy()) if (std::optional Range = CB->getRange()) return ValueLatticeElement::getRange(*Range); + if (CB->getType()->isPointerTy() && CB->isReturnNonNull()) + return ValueLatticeElement::getNot( + ConstantPointerNull::get(cast(I->getType()))); } + + if (I->getType()->isIntOrIntVectorTy()) + if (MDNode *Ranges = I->getMetadata(LLVMContext::MD_range)) + return ValueLatticeElement::getRange( + getConstantRangeFromMetadata(*Ranges)); if (I->hasMetadata(LLVMContext::MD_nonnull)) return ValueLatticeElement::getNot( ConstantPointerNull::get(cast(I->getType()))); + return ValueLatticeElement::getOverdefined(); } diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index fb2efe581ac6b..1e6dc88ed9353 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -1454,10 +1454,12 @@ Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilderBase &B) { if (NonContRanges > 2) return nullptr; + // Slice off the character's high end bits. + CharVal = B.CreateTrunc(CharVal, B.getInt8Ty()); + SmallVector CharCompares; for (unsigned char C : SortedStr) - CharCompares.push_back( - B.CreateICmpEQ(CharVal, ConstantInt::get(CharVal->getType(), C))); + CharCompares.push_back(B.CreateICmpEQ(CharVal, B.getInt8(C))); return B.CreateIntToPtr(B.CreateOr(CharCompares), CI->getType()); } diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index def73e8d0c0db..f471d2b1561d8 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4991,9 +4991,9 @@ static bool clusterSortPtrAccesses(ArrayRef VL, Type *ElemTy, auto *Mid = std::stable_partition( Begin, End, [&Root](auto V) { return std::get<2>(V) == Root; }); DenseMap> LessThan; - for (auto I = Begin; I < Mid; ++I) + for (auto *I = Begin; I < Mid; ++I) LessThan.try_emplace(std::get<1>(*I)); - for (auto I = Begin; I < Mid; ++I) { + for (auto *I = Begin; I < Mid; ++I) { Value *V = std::get<1>(*I); while (auto *Gep = dyn_cast(V)) { V = Gep->getOperand(0); @@ -9455,7 +9455,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, auto It = MinBWs.find(E); Type *OrigScalarTy = ScalarTy; if (It != MinBWs.end()) { - auto VecTy = dyn_cast(ScalarTy); + auto *VecTy = dyn_cast(ScalarTy); ScalarTy = IntegerType::get(F->getContext(), It->second.first); if (VecTy) ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements()); @@ -13133,7 +13133,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { ScalarTy = IE->getOperand(1)->getType(); auto It = MinBWs.find(E); if (It != MinBWs.end()) { - auto VecTy = dyn_cast(ScalarTy); + auto *VecTy = dyn_cast(ScalarTy); ScalarTy = IntegerType::get(F->getContext(), It->second.first); if (VecTy) ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements()); @@ -13765,6 +13765,27 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 
<< ".\n"); return E->VectorizedValue; } + if (isa(ScalarTy)) { + assert(SLPReVec && "FixedVectorType is not expected."); + // CreateMaskedGather expects VecTy and VecPtr have same size. We need + // to expand VecPtr if ScalarTy is a vector type. + unsigned ScalarTyNumElements = + cast(ScalarTy)->getNumElements(); + unsigned VecTyNumElements = + cast(VecTy)->getNumElements(); + assert(VecTyNumElements % ScalarTyNumElements == 0 && + "Cannot expand getelementptr."); + unsigned VF = VecTyNumElements / ScalarTyNumElements; + SmallVector Indices(VecTyNumElements); + transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) { + return Builder.getInt64(I % ScalarTyNumElements); + }); + VecPtr = Builder.CreateGEP( + VecTy->getElementType(), + Builder.CreateShuffleVector( + VecPtr, createReplicatedMask(ScalarTyNumElements, VF)), + ConstantVector::get(Indices)); + } // Use the minimum alignment of the gathered loads. Align CommonAlignment = computeCommonAlignment(E->Scalars); NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment); @@ -15975,6 +15996,10 @@ void BoUpSLP::computeMinimumValueSizes() { auto It = MinBWs.find(TE); if (It != MinBWs.end() && It->second.first > UserTESz) return true; + // The size of icmp is always 1 and should not be + // considered. + if (TE->getOpcode() == Instruction::ICmp) + return true; return DL->getTypeSizeInBits(U->getType()) > UserTESz; })); })) { diff --git a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll index 809b15b200495..81d8b01fe7fb7 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll @@ -130,8 +130,16 @@ define void @neg_dist_dep_type_size_equivalence(ptr nocapture %vec, i64 %n) { ; CHECK-LABEL: 'neg_dist_dep_type_size_equivalence' ; CHECK-NEXT: loop: ; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop -; CHECK-NEXT: Backward loop carried data dependence that prevents store-to-load forwarding. +; CHECK-NEXT: Unknown data dependence. ; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %ld.f64 = load double, ptr %gep.iv, align 8 -> +; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8 +; CHECK-EMPTY: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %ld.i64 = load i64, ptr %gep.iv, align 8 -> +; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8 +; CHECK-EMPTY: ; CHECK-NEXT: BackwardVectorizableButPreventsForwarding: ; CHECK-NEXT: %ld.f64 = load double, ptr %gep.iv, align 8 -> ; CHECK-NEXT: store double %val, ptr %gep.iv.101.i64, align 8 diff --git a/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-backward.ll b/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-backward.ll index 845ff078ee0eb..416742a94e0d3 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-backward.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-backward.ll @@ -45,8 +45,13 @@ exit: define void @different_non_constant_strides_known_backward_distance_larger_than_trip_count(ptr %A) { ; CHECK-LABEL: 'different_non_constant_strides_known_backward_distance_larger_than_trip_count' ; CHECK-NEXT: loop: -; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. 
Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. ; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %gep, align 4 -> +; CHECK-NEXT: store i32 %add, ptr %gep.mul.2, align 4 +; CHECK-EMPTY: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: ; CHECK-EMPTY: diff --git a/llvm/test/Bitcode/attributes.ll b/llvm/test/Bitcode/attributes.ll index 4402289ac170d..835622276ef27 100644 --- a/llvm/test/Bitcode/attributes.ll +++ b/llvm/test/Bitcode/attributes.ll @@ -511,6 +511,12 @@ define void @f92() sanitize_realtime ret void; } +; CHECK: define void @f93() #54 +define void @f93() nosanitize_realtime +{ + ret void; +} + ; CHECK: define void @f87() [[FNRETTHUNKEXTERN:#[0-9]+]] define void @f87() fn_ret_thunk_extern { ret void } @@ -606,6 +612,7 @@ define void @initializes(ptr initializes((-4, 0), (4, 8)) %a) { ; CHECK: attributes #51 = { uwtable(sync) } ; CHECK: attributes #52 = { nosanitize_bounds } ; CHECK: attributes #53 = { sanitize_realtime } +; CHECK: attributes #54 = { nosanitize_realtime } ; CHECK: attributes [[FNRETTHUNKEXTERN]] = { fn_ret_thunk_extern } ; CHECK: attributes [[SKIPPROFILE]] = { skipprofile } ; CHECK: attributes [[OPTDEBUG]] = { optdebug } diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll index fd60c49a4be39..c401cde8e146e 100644 --- a/llvm/test/Bitcode/compatibility.ll +++ b/llvm/test/Bitcode/compatibility.ll @@ -1562,7 +1562,7 @@ exit: ; CHECK: select <2 x i1> , <2 x i8> , <2 x i8> call void @f.nobuiltin() builtin - ; CHECK: call void @f.nobuiltin() #53 + ; CHECK: call void @f.nobuiltin() #54 call fastcc noalias ptr @f.noalias() noinline ; CHECK: call fastcc noalias ptr @f.noalias() #12 @@ -1992,6 +1992,9 @@ declare void @f.sanitize_numerical_stability() sanitize_numerical_stability declare void @f.sanitize_realtime() sanitize_realtime ; CHECK: declare void @f.sanitize_realtime() #52 +declare void @f.nosanitize_realtime() nosanitize_realtime +; CHECK: declare void @f.nosanitize_realtime() #53 + ; CHECK: declare nofpclass(snan) float @nofpclass_snan(float nofpclass(snan)) declare nofpclass(snan) float @nofpclass_snan(float nofpclass(snan)) @@ -2115,7 +2118,8 @@ define float @nofpclass_callsites(float %arg) { ; CHECK: attributes #50 = { allockind("alloc,uninitialized") } ; CHECK: attributes #51 = { sanitize_numerical_stability } ; CHECK: attributes #52 = { sanitize_realtime } -; CHECK: attributes #53 = { builtin } +; CHECK: attributes #53 = { nosanitize_realtime } +; CHECK: attributes #54 = { builtin } ;; Metadata diff --git a/llvm/test/CodeGen/AArch64/sve-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-bitcast.ll index 95f43ba512632..5d12d41ac3332 100644 --- a/llvm/test/CodeGen/AArch64/sve-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/sve-bitcast.ll @@ -13,14 +13,8 @@ define @bitcast_nxv8i16_to_nxv16i8( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv8i16_to_nxv16i8: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: ptrue p1.b -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1b { z0.b }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -33,14 +27,8 @@ define @bitcast_nxv4i32_to_nxv16i8( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv4i32_to_nxv16i8: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.s -; CHECK_BE-NEXT: ptrue p1.b -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] -; CHECK_BE-NEXT: ld1b { z0.b }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -53,14 +41,8 @@ define @bitcast_nxv2i64_to_nxv16i8( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv2i64_to_nxv16i8: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.b -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1b { z0.b }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -73,14 +55,8 @@ define @bitcast_nxv8f16_to_nxv16i8( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv8f16_to_nxv16i8: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: ptrue p1.b -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1b { z0.b }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -93,14 +69,8 @@ define @bitcast_nxv4f32_to_nxv16i8( %v) # ; ; CHECK_BE-LABEL: bitcast_nxv4f32_to_nxv16i8: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.s -; CHECK_BE-NEXT: ptrue p1.b -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] -; CHECK_BE-NEXT: ld1b { z0.b }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -113,14 +83,8 @@ define @bitcast_nxv2f64_to_nxv16i8( %v) ; ; CHECK_BE-LABEL: bitcast_nxv2f64_to_nxv16i8: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.b -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1b { z0.b }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -133,14 +97,8 @@ define @bitcast_nxv8bf16_to_nxv16i8( %v) ; ; CHECK_BE-LABEL: bitcast_nxv8bf16_to_nxv16i8: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: ptrue p1.b -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1b { z0.b }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -157,14 +115,8 @@ define @bitcast_nxv16i8_to_nxv8i16( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv16i8_to_nxv8i16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.b -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1b { z0.b }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -177,14 +129,10 @@ define @bitcast_nxv4i32_to_nxv8i16( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv4i32_to_nxv8i16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.s -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -197,14 +145,10 @@ define @bitcast_nxv2i64_to_nxv8i16( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv2i64_to_nxv8i16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -217,13 +161,6 @@ define @bitcast_nxv8f16_to_nxv8i16( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv8f16_to_nxv8i16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p0/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -236,14 +173,10 @@ define @bitcast_nxv4f32_to_nxv8i16( %v) # ; ; CHECK_BE-LABEL: bitcast_nxv4f32_to_nxv8i16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.s -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -256,14 +189,10 @@ define @bitcast_nxv2f64_to_nxv8i16( %v) ; ; CHECK_BE-LABEL: bitcast_nxv2f64_to_nxv8i16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -276,13 +205,6 @@ define @bitcast_nxv8bf16_to_nxv8i16( %v) ; ; CHECK_BE-LABEL: bitcast_nxv8bf16_to_nxv8i16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p0/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -299,14 +221,8 @@ define @bitcast_nxv16i8_to_nxv4i32( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv16i8_to_nxv4i32: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.b -; CHECK_BE-NEXT: ptrue p1.s -; CHECK_BE-NEXT: st1b { z0.b }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: ptrue p0.s +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -319,14 +235,10 @@ define @bitcast_nxv8i16_to_nxv4i32( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv8i16_to_nxv4i32: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: ptrue p1.s -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: ptrue p0.s +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -339,14 +251,10 @@ define @bitcast_nxv2i64_to_nxv4i32( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv2i64_to_nxv4i32: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.s -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d +; CHECK_BE-NEXT: ptrue p0.s +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -359,14 +267,10 @@ define @bitcast_nxv8f16_to_nxv4i32( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv8f16_to_nxv4i32: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: ptrue p1.s -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: ptrue p0.s +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -379,13 +283,6 @@ define @bitcast_nxv4f32_to_nxv4i32( %v) # ; ; CHECK_BE-LABEL: bitcast_nxv4f32_to_nxv4i32: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.s -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p0/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -398,14 +295,10 @@ define @bitcast_nxv2f64_to_nxv4i32( %v) ; ; CHECK_BE-LABEL: bitcast_nxv2f64_to_nxv4i32: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.s -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d +; CHECK_BE-NEXT: ptrue p0.s +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -418,14 +311,10 @@ define @bitcast_nxv8bf16_to_nxv4i32( %v) ; ; CHECK_BE-LABEL: bitcast_nxv8bf16_to_nxv4i32: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: ptrue p1.s -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: ptrue p0.s +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -442,14 +331,8 @@ define @bitcast_nxv16i8_to_nxv2i64( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv16i8_to_nxv2i64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.b -; CHECK_BE-NEXT: ptrue p1.d -; CHECK_BE-NEXT: st1b { z0.b }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: ptrue p0.d +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -462,14 +345,10 @@ define @bitcast_nxv8i16_to_nxv2i64( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv8i16_to_nxv2i64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: ptrue p1.d -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: ptrue p0.d +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -482,14 +361,10 @@ define @bitcast_nxv4i32_to_nxv2i64( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv4i32_to_nxv2i64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.s -; CHECK_BE-NEXT: ptrue p1.d -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s +; CHECK_BE-NEXT: ptrue p0.d +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -502,14 +377,10 @@ define @bitcast_nxv8f16_to_nxv2i64( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv8f16_to_nxv2i64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: ptrue p1.d -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: ptrue p0.d +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -522,14 +393,10 @@ define @bitcast_nxv4f32_to_nxv2i64( %v) # ; ; CHECK_BE-LABEL: bitcast_nxv4f32_to_nxv2i64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.s -; CHECK_BE-NEXT: ptrue p1.d -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s +; CHECK_BE-NEXT: ptrue p0.d +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -542,13 +409,6 @@ define @bitcast_nxv2f64_to_nxv2i64( %v) ; ; CHECK_BE-LABEL: bitcast_nxv2f64_to_nxv2i64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p0/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -561,14 +421,10 @@ define @bitcast_nxv8bf16_to_nxv2i64( %v) ; ; CHECK_BE-LABEL: bitcast_nxv8bf16_to_nxv2i64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: ptrue p1.d -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: ptrue p0.d +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -585,14 +441,8 @@ define @bitcast_nxv16i8_to_nxv8f16( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv16i8_to_nxv8f16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.b -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1b { z0.b }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -605,13 +455,6 @@ define @bitcast_nxv8i16_to_nxv8f16( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv8i16_to_nxv8f16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p0/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -624,14 +467,10 @@ define @bitcast_nxv4i32_to_nxv8f16( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv4i32_to_nxv8f16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.s -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -644,14 +483,10 @@ define @bitcast_nxv2i64_to_nxv8f16( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv2i64_to_nxv8f16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -664,14 +499,10 @@ define @bitcast_nxv4f32_to_nxv8f16( %v) ; ; CHECK_BE-LABEL: bitcast_nxv4f32_to_nxv8f16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.s -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -684,14 +515,10 @@ define @bitcast_nxv2f64_to_nxv8f16( %v) ; ; CHECK_BE-LABEL: bitcast_nxv2f64_to_nxv8f16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -704,13 +531,6 @@ define @bitcast_nxv8bf16_to_nxv8f16( %v ; ; CHECK_BE-LABEL: bitcast_nxv8bf16_to_nxv8f16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p0/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -727,14 +547,8 @@ define @bitcast_nxv16i8_to_nxv4f32( %v) # ; ; CHECK_BE-LABEL: bitcast_nxv16i8_to_nxv4f32: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.b -; CHECK_BE-NEXT: ptrue p1.s -; CHECK_BE-NEXT: st1b { z0.b }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: ptrue p0.s +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -747,14 +561,10 @@ define @bitcast_nxv8i16_to_nxv4f32( %v) # ; ; CHECK_BE-LABEL: bitcast_nxv8i16_to_nxv4f32: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: ptrue p1.s -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: ptrue p0.s +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -767,13 +577,6 @@ define @bitcast_nxv4i32_to_nxv4f32( %v) # ; ; CHECK_BE-LABEL: bitcast_nxv4i32_to_nxv4f32: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.s -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p0/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -786,14 +589,10 @@ define @bitcast_nxv2i64_to_nxv4f32( %v) # ; ; CHECK_BE-LABEL: bitcast_nxv2i64_to_nxv4f32: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.s -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d +; CHECK_BE-NEXT: ptrue p0.s +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -806,14 +605,10 @@ define @bitcast_nxv8f16_to_nxv4f32( %v) ; ; CHECK_BE-LABEL: bitcast_nxv8f16_to_nxv4f32: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: ptrue p1.s -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: ptrue p0.s +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -826,14 +621,10 @@ define @bitcast_nxv2f64_to_nxv4f32( %v ; ; CHECK_BE-LABEL: bitcast_nxv2f64_to_nxv4f32: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.s -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d +; CHECK_BE-NEXT: ptrue p0.s +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -846,14 +637,10 @@ define @bitcast_nxv8bf16_to_nxv4f32( % ; ; CHECK_BE-LABEL: bitcast_nxv8bf16_to_nxv4f32: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: ptrue p1.s -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: ptrue p0.s +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -870,14 +657,8 @@ define @bitcast_nxv16i8_to_nxv2f64( %v) ; ; CHECK_BE-LABEL: bitcast_nxv16i8_to_nxv2f64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.b -; CHECK_BE-NEXT: ptrue p1.d -; CHECK_BE-NEXT: st1b { z0.b }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: ptrue p0.d +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -890,14 +671,10 @@ define @bitcast_nxv8i16_to_nxv2f64( %v) ; ; CHECK_BE-LABEL: bitcast_nxv8i16_to_nxv2f64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: ptrue p1.d -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: ptrue p0.d +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -910,14 +687,10 @@ define @bitcast_nxv4i32_to_nxv2f64( %v) ; ; CHECK_BE-LABEL: bitcast_nxv4i32_to_nxv2f64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.s -; CHECK_BE-NEXT: ptrue p1.d -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s +; CHECK_BE-NEXT: ptrue p0.d +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -930,13 +703,6 @@ define @bitcast_nxv2i64_to_nxv2f64( %v) ; ; CHECK_BE-LABEL: bitcast_nxv2i64_to_nxv2f64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p0/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -949,14 +715,10 @@ define @bitcast_nxv8f16_to_nxv2f64( %v) ; ; CHECK_BE-LABEL: bitcast_nxv8f16_to_nxv2f64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: ptrue p1.d -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: ptrue p0.d +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -969,14 +731,10 @@ define @bitcast_nxv4f32_to_nxv2f64( %v ; ; CHECK_BE-LABEL: bitcast_nxv4f32_to_nxv2f64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.s -; CHECK_BE-NEXT: ptrue p1.d -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s +; CHECK_BE-NEXT: ptrue p0.d +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -989,14 +747,10 @@ define @bitcast_nxv8bf16_to_nxv2f64( ; ; CHECK_BE-LABEL: bitcast_nxv8bf16_to_nxv2f64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: ptrue p1.d -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: ptrue p0.d +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1013,14 +767,8 @@ define @bitcast_nxv16i8_to_nxv8bf16( %v) ; ; CHECK_BE-LABEL: bitcast_nxv16i8_to_nxv8bf16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.b -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1b { z0.b }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1033,13 +781,6 @@ define @bitcast_nxv8i16_to_nxv8bf16( %v) ; ; CHECK_BE-LABEL: bitcast_nxv8i16_to_nxv8bf16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p0/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1052,14 +793,10 @@ define @bitcast_nxv4i32_to_nxv8bf16( %v) ; ; CHECK_BE-LABEL: bitcast_nxv4i32_to_nxv8bf16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.s -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1072,14 +809,10 @@ define @bitcast_nxv2i64_to_nxv8bf16( %v) ; ; CHECK_BE-LABEL: bitcast_nxv2i64_to_nxv8bf16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1092,13 +825,6 @@ define @bitcast_nxv8f16_to_nxv8bf16( %v ; ; CHECK_BE-LABEL: bitcast_nxv8f16_to_nxv8bf16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p0/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1111,14 +837,10 @@ define @bitcast_nxv4f32_to_nxv8bf16( % ; ; CHECK_BE-LABEL: bitcast_nxv4f32_to_nxv8bf16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.s -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1131,14 +853,10 @@ define @bitcast_nxv2f64_to_nxv8bf16( ; ; CHECK_BE-LABEL: bitcast_nxv2f64_to_nxv8bf16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1212,15 +930,9 @@ define @bitcast_nxv1i64_to_nxv8i8( %v) #0 { ; ; CHECK_BE-LABEL: bitcast_nxv1i64_to_nxv8i8: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.b -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1b { z0.b }, p1/z, [sp] +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: uunpklo z0.h, z0.b -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1290,15 +1002,9 @@ define @bitcast_nxv1f64_to_nxv8i8( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv1f64_to_nxv8i8: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.b -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1b { z0.b }, p1/z, [sp] +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: uunpklo z0.h, z0.b -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1400,15 +1106,11 @@ define @bitcast_nxv1i64_to_nxv4i16( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv1i64_to_nxv4i16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: uunpklo z0.s, z0.h -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1420,15 +1122,11 @@ define @bitcast_nxv4f16_to_nxv4i16( %v) #0 ; CHECK-NEXT: ret ; ; CHECK_BE-LABEL: bitcast_nxv4f16_to_nxv4i16: -; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: ptrue p1.s -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE: // %bb.0: +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: ptrue p0.s +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1470,15 +1168,11 @@ define @bitcast_nxv1f64_to_nxv4i16( %v) ; ; CHECK_BE-LABEL: bitcast_nxv1f64_to_nxv4i16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: uunpklo z0.s, z0.h -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1491,14 +1185,10 @@ define @bitcast_nxv4bf16_to_nxv4i16( %v) ; ; CHECK_BE-LABEL: bitcast_nxv4bf16_to_nxv4i16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: ptrue p1.s -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: ptrue p0.s +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1572,15 +1262,11 @@ define @bitcast_nxv1i64_to_nxv2i32( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv1i64_to_nxv2i32: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.s -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d +; CHECK_BE-NEXT: ptrue p0.s +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s ; CHECK_BE-NEXT: uunpklo z0.d, z0.s -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1621,14 +1307,10 @@ define @bitcast_nxv2f32_to_nxv2i32( %v) # ; ; CHECK_BE-LABEL: bitcast_nxv2f32_to_nxv2i32: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.s -; CHECK_BE-NEXT: ptrue p1.d -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s +; CHECK_BE-NEXT: ptrue p0.d +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1642,15 +1324,11 @@ define @bitcast_nxv1f64_to_nxv2i32( %v) ; ; CHECK_BE-LABEL: bitcast_nxv1f64_to_nxv2i32: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.s -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d +; CHECK_BE-NEXT: ptrue p0.s +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s ; CHECK_BE-NEXT: uunpklo z0.d, z0.s -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1696,15 +1374,9 @@ define @bitcast_nxv8i8_to_nxv1i64( %v) #0 { ; ; CHECK_BE-LABEL: bitcast_nxv8i8_to_nxv1i64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK_BE-NEXT: ptrue p0.b -; CHECK_BE-NEXT: ptrue p1.d -; CHECK_BE-NEXT: st1b { z0.b }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: ptrue p0.d +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1718,15 +1390,11 @@ define @bitcast_nxv4i16_to_nxv1i64( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv4i16_to_nxv1i64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: ptrue p1.d -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: ptrue p0.d +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1740,15 +1408,11 @@ define @bitcast_nxv2i32_to_nxv1i64( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv2i32_to_nxv1i64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK_BE-NEXT: ptrue p0.s -; CHECK_BE-NEXT: ptrue p1.d -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s +; CHECK_BE-NEXT: ptrue p0.d +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1762,20 +1426,14 @@ define @bitcast_nxv4f16_to_nxv1i64( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv4f16_to_nxv1i64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-3 ; CHECK_BE-NEXT: ptrue p0.h ; CHECK_BE-NEXT: ptrue p1.s -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] -; CHECK_BE-NEXT: ptrue p1.d +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: revb z0.s, p1/m, z0.s ; CHECK_BE-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp, #1, mul vl] -; CHECK_BE-NEXT: ld1h { z0.h }, p0/z, [sp, #1, mul vl] -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp, #2, mul vl] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp, #2, mul vl] -; CHECK_BE-NEXT: addvl sp, sp, #3 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: ptrue p0.d +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1789,19 +1447,13 @@ define @bitcast_nxv2f32_to_nxv1i64( %v) # ; ; CHECK_BE-LABEL: bitcast_nxv2f32_to_nxv1i64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-3 ; CHECK_BE-NEXT: ptrue p0.s ; CHECK_BE-NEXT: ptrue p1.d -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s +; CHECK_BE-NEXT: revb z0.d, p1/m, z0.d ; CHECK_BE-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp, #1, mul vl] -; CHECK_BE-NEXT: ld1w { z0.s }, p0/z, [sp, #1, mul vl] -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp, #2, mul vl] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp, #2, mul vl] -; CHECK_BE-NEXT: addvl sp, sp, #3 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s +; CHECK_BE-NEXT: revb z0.d, p1/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1814,13 +1466,6 @@ define @bitcast_nxv1f64_to_nxv1i64( %v) ; ; CHECK_BE-LABEL: bitcast_nxv1f64_to_nxv1i64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p0/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1834,20 +1479,14 @@ define @bitcast_nxv4bf16_to_nxv1i64( %v) ; ; CHECK_BE-LABEL: bitcast_nxv4bf16_to_nxv1i64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-3 ; CHECK_BE-NEXT: ptrue p0.h ; CHECK_BE-NEXT: ptrue p1.s -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] -; CHECK_BE-NEXT: ptrue p1.d +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: revb z0.s, p1/m, z0.s ; CHECK_BE-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp, #1, mul vl] -; CHECK_BE-NEXT: ld1h { z0.h }, p0/z, [sp, #1, mul vl] -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp, #2, mul vl] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp, #2, mul vl] -; CHECK_BE-NEXT: addvl sp, sp, #3 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: ptrue p0.d +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1892,14 +1531,10 @@ define @bitcast_nxv4i16_to_nxv4f16( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv4i16_to_nxv4f16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.s -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1941,15 +1576,11 @@ define @bitcast_nxv1i64_to_nxv4f16( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv1i64_to_nxv4f16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: uunpklo z0.s, z0.h -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -1991,15 +1622,11 @@ define @bitcast_nxv1f64_to_nxv4f16( %v) ; ; CHECK_BE-LABEL: bitcast_nxv1f64_to_nxv4f16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: uunpklo z0.s, z0.h -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -2012,13 +1639,6 @@ define @bitcast_nxv4bf16_to_nxv4f16( %v ; ; CHECK_BE-LABEL: bitcast_nxv4bf16_to_nxv4f16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.s -; CHECK_BE-NEXT: st1h { z0.s }, p0, [sp, #1, mul vl] -; CHECK_BE-NEXT: ld1h { z0.s }, p0/z, [sp, #1, mul vl] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -2091,14 +1711,10 @@ define @bitcast_nxv2i32_to_nxv2f32( %v) # ; ; CHECK_BE-LABEL: bitcast_nxv2i32_to_nxv2f32: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.s -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d +; CHECK_BE-NEXT: ptrue p0.s +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -2112,15 +1728,11 @@ define @bitcast_nxv1i64_to_nxv2f32( %v) # ; ; CHECK_BE-LABEL: bitcast_nxv1i64_to_nxv2f32: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.s -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d +; CHECK_BE-NEXT: ptrue p0.s +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s ; CHECK_BE-NEXT: uunpklo z0.d, z0.s -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -2162,15 +1774,11 @@ define @bitcast_nxv1f64_to_nxv2f32( %v ; ; CHECK_BE-LABEL: bitcast_nxv1f64_to_nxv2f32: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.s -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d +; CHECK_BE-NEXT: ptrue p0.s +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s ; CHECK_BE-NEXT: uunpklo z0.d, z0.s -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -2216,15 +1824,9 @@ define @bitcast_nxv8i8_to_nxv1f64( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv8i8_to_nxv1f64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK_BE-NEXT: ptrue p0.b -; CHECK_BE-NEXT: ptrue p1.d -; CHECK_BE-NEXT: st1b { z0.b }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: ptrue p0.d +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -2238,15 +1840,11 @@ define @bitcast_nxv4i16_to_nxv1f64( %v) ; ; CHECK_BE-LABEL: bitcast_nxv4i16_to_nxv1f64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: ptrue p1.d -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: ptrue p0.d +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -2260,15 +1858,11 @@ define @bitcast_nxv2i32_to_nxv1f64( %v) ; ; CHECK_BE-LABEL: bitcast_nxv2i32_to_nxv1f64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK_BE-NEXT: ptrue p0.s -; CHECK_BE-NEXT: ptrue p1.d -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s +; CHECK_BE-NEXT: ptrue p0.d +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -2281,13 +1875,6 @@ define @bitcast_nxv1i64_to_nxv1f64( %v) ; ; CHECK_BE-LABEL: bitcast_nxv1i64_to_nxv1f64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p0/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -2301,20 +1888,14 @@ define @bitcast_nxv4f16_to_nxv1f64( %v) ; ; CHECK_BE-LABEL: bitcast_nxv4f16_to_nxv1f64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-3 ; CHECK_BE-NEXT: ptrue p0.h ; CHECK_BE-NEXT: ptrue p1.s -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] -; CHECK_BE-NEXT: ptrue p1.d +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: revb z0.s, p1/m, z0.s ; CHECK_BE-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp, #1, mul vl] -; CHECK_BE-NEXT: ld1h { z0.h }, p0/z, [sp, #1, mul vl] -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp, #2, mul vl] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp, #2, mul vl] -; CHECK_BE-NEXT: addvl sp, sp, #3 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: ptrue p0.d +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -2328,19 +1909,13 @@ define @bitcast_nxv2f32_to_nxv1f64( %v ; ; CHECK_BE-LABEL: bitcast_nxv2f32_to_nxv1f64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-3 ; CHECK_BE-NEXT: ptrue p0.s ; CHECK_BE-NEXT: ptrue p1.d -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s +; CHECK_BE-NEXT: revb z0.d, p1/m, z0.d ; CHECK_BE-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp, #1, mul vl] -; CHECK_BE-NEXT: ld1w { z0.s }, p0/z, [sp, #1, mul vl] -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp, #2, mul vl] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp, #2, mul vl] -; CHECK_BE-NEXT: addvl sp, sp, #3 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s +; CHECK_BE-NEXT: revb z0.d, p1/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -2354,20 +1929,14 @@ define @bitcast_nxv4bf16_to_nxv1f64( ; ; CHECK_BE-LABEL: bitcast_nxv4bf16_to_nxv1f64: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-3 ; CHECK_BE-NEXT: ptrue p0.h ; CHECK_BE-NEXT: ptrue p1.s -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] -; CHECK_BE-NEXT: ptrue p1.d +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: revb z0.s, p1/m, z0.s ; CHECK_BE-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp, #1, mul vl] -; CHECK_BE-NEXT: ld1h { z0.h }, p0/z, [sp, #1, mul vl] -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp, #2, mul vl] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp, #2, mul vl] -; CHECK_BE-NEXT: addvl sp, sp, #3 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: ptrue p0.d +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -2412,14 +1981,10 @@ define @bitcast_nxv4i16_to_nxv4bf16( %v) ; ; CHECK_BE-LABEL: bitcast_nxv4i16_to_nxv4bf16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.s -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -2461,15 +2026,11 @@ define @bitcast_nxv1i64_to_nxv4bf16( %v) ; ; CHECK_BE-LABEL: bitcast_nxv1i64_to_nxv4bf16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: uunpklo z0.s, z0.h -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -2482,13 +2043,6 @@ define @bitcast_nxv4f16_to_nxv4bf16( %v ; ; CHECK_BE-LABEL: bitcast_nxv4f16_to_nxv4bf16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.s -; CHECK_BE-NEXT: st1h { z0.s }, p0, [sp, #1, mul vl] -; CHECK_BE-NEXT: ld1h { z0.s }, p0/z, [sp, #1, mul vl] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -2530,15 +2084,11 @@ define @bitcast_nxv1f64_to_nxv4bf16( ; ; CHECK_BE-LABEL: bitcast_nxv1f64_to_nxv4bf16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: uunpklo z0.s, z0.h -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -2585,16 +2135,10 @@ define @bitcast_nxv1i32_to_nxv4i8( %v) #0 { ; ; CHECK_BE-LABEL: bitcast_nxv1i32_to_nxv4i8: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.s -; CHECK_BE-NEXT: ptrue p1.b -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] -; CHECK_BE-NEXT: ld1b { z0.b }, p1/z, [sp] +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s ; CHECK_BE-NEXT: uunpklo z0.h, z0.b ; CHECK_BE-NEXT: uunpklo z0.s, z0.h -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -2699,16 +2243,12 @@ define @bitcast_nxv1i32_to_nxv2i16( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv1i32_to_nxv2i16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.s -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: uunpklo z0.s, z0.h ; CHECK_BE-NEXT: uunpklo z0.d, z0.s -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -2721,14 +2261,10 @@ define @bitcast_nxv2f16_to_nxv2i16( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv2f16_to_nxv2i16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: ptrue p1.d -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: ptrue p0.d +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -2743,14 +2279,10 @@ define @bitcast_nxv2bf16_to_nxv2i16( %v) ; ; CHECK_BE-LABEL: bitcast_nxv2bf16_to_nxv2i16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: ptrue p1.d -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: ptrue p0.d +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -2769,16 +2301,10 @@ define @bitcast_nxv4i8_to_nxv1i32( %v) #0 { ; ; CHECK_BE-LABEL: bitcast_nxv4i8_to_nxv1i32: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK_BE-NEXT: ptrue p0.b -; CHECK_BE-NEXT: ptrue p1.s +; CHECK_BE-NEXT: ptrue p0.s ; CHECK_BE-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK_BE-NEXT: st1b { z0.b }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -2793,16 +2319,12 @@ define @bitcast_nxv2i16_to_nxv1i32( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv2i16_to_nxv1i32: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: ptrue p1.s ; CHECK_BE-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h +; CHECK_BE-NEXT: ptrue p0.s +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -2824,15 +2346,15 @@ define @bitcast_nxv2f16_to_nxv1i32( %v) #0 ; CHECK_BE-LABEL: bitcast_nxv2f16_to_nxv1i32: ; CHECK_BE: // %bb.0: ; CHECK_BE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-2 +; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d ; CHECK_BE-NEXT: ptrue p1.h ; CHECK_BE-NEXT: st1h { z0.d }, p0, [sp] ; CHECK_BE-NEXT: ptrue p0.s ; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] -; CHECK_BE-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl] -; CHECK_BE-NEXT: ld1w { z0.s }, p0/z, [sp, #1, mul vl] -; CHECK_BE-NEXT: addvl sp, sp, #2 +; CHECK_BE-NEXT: revb z0.h, p1/m, z0.h +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s +; CHECK_BE-NEXT: addvl sp, sp, #1 ; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to @@ -2857,15 +2379,15 @@ define @bitcast_nxv2bf16_to_nxv1i32( %v) ; CHECK_BE-LABEL: bitcast_nxv2bf16_to_nxv1i32: ; CHECK_BE: // %bb.0: ; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-2 +; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d ; CHECK_BE-NEXT: ptrue p1.h ; CHECK_BE-NEXT: st1h { z0.d }, p0, [sp] ; CHECK_BE-NEXT: ptrue p0.s ; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] -; CHECK_BE-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl] -; CHECK_BE-NEXT: ld1w { z0.s }, p0/z, [sp, #1, mul vl] -; CHECK_BE-NEXT: addvl sp, sp, #2 +; CHECK_BE-NEXT: revb z0.h, p1/m, z0.h +; CHECK_BE-NEXT: revb z0.s, p0/m, z0.s +; CHECK_BE-NEXT: addvl sp, sp, #1 ; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to @@ -2911,14 +2433,10 @@ define @bitcast_nxv2i16_to_nxv2f16( %v) #0 ; ; CHECK_BE-LABEL: bitcast_nxv2i16_to_nxv2f16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -2934,13 +2452,6 @@ define @bitcast_nxv2bf16_to_nxv2f16( %v ; ; CHECK_BE-LABEL: bitcast_nxv2bf16_to_nxv2f16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: st1h { z0.d }, p0, [sp, #3, mul vl] -; CHECK_BE-NEXT: ld1h { z0.d }, p0/z, [sp, #3, mul vl] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -2995,14 +2506,10 @@ define @bitcast_nxv2i16_to_nxv2bf16( %v) ; ; CHECK_BE-LABEL: bitcast_nxv2i16_to_nxv2bf16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d +; CHECK_BE-NEXT: ptrue p0.h +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -3017,13 +2524,6 @@ define @bitcast_nxv2f16_to_nxv2bf16( %v ; ; CHECK_BE-LABEL: bitcast_nxv2f16_to_nxv2bf16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.d -; CHECK_BE-NEXT: st1h { z0.d }, p0, [sp, #3, mul vl] -; CHECK_BE-NEXT: ld1h { z0.d }, p0/z, [sp, #3, mul vl] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -3045,17 +2545,11 @@ define @bitcast_nxv1i16_to_nxv2i8( %v) #0 { ; ; CHECK_BE-LABEL: bitcast_nxv1i16_to_nxv2i8: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.h -; CHECK_BE-NEXT: ptrue p1.b -; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] -; CHECK_BE-NEXT: ld1b { z0.b }, p1/z, [sp] +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: uunpklo z0.h, z0.b ; CHECK_BE-NEXT: uunpklo z0.s, z0.h ; CHECK_BE-NEXT: uunpklo z0.d, z0.s -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -3078,17 +2572,11 @@ define @bitcast_nxv2i8_to_nxv1i16( %v) #0 { ; ; CHECK_BE-LABEL: bitcast_nxv2i8_to_nxv1i16: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK_BE-NEXT: ptrue p0.b -; CHECK_BE-NEXT: ptrue p1.h +; CHECK_BE-NEXT: ptrue p0.h ; CHECK_BE-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK_BE-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK_BE-NEXT: st1b { z0.b }, p0, [sp] -; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.h, p0/m, z0.h ; CHECK_BE-NEXT: ret %bc = bitcast %v to ret %bc @@ -3126,15 +2614,11 @@ define @bitcast_short_float_to_i32( %v) ; ; CHECK_BE-LABEL: bitcast_short_float_to_i32: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d ; CHECK_BE-NEXT: ptrue p1.s ; CHECK_BE-NEXT: fcvt z0.s, p0/m, z0.d -; CHECK_BE-NEXT: st1w { z0.s }, p1, [sp] -; CHECK_BE-NEXT: ld1d { z0.d }, p0/z, [sp] -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK_BE-NEXT: revb z0.s, p1/m, z0.s +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d ; CHECK_BE-NEXT: ret %trunc = fptrunc %v to %bitcast = bitcast %trunc to @@ -3150,15 +2634,11 @@ define @bitcast_short_i32_to_float( %v) ; ; CHECK_BE-LABEL: bitcast_short_i32_to_float: ; CHECK_BE: // %bb.0: -; CHECK_BE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK_BE-NEXT: addvl sp, sp, #-1 ; CHECK_BE-NEXT: ptrue p0.d ; CHECK_BE-NEXT: ptrue p1.s -; CHECK_BE-NEXT: st1d { z0.d }, p0, [sp] -; CHECK_BE-NEXT: ld1w { z0.s }, p1/z, [sp] +; CHECK_BE-NEXT: revb z0.d, p0/m, z0.d +; CHECK_BE-NEXT: revb z0.s, p1/m, z0.s ; CHECK_BE-NEXT: fcvt z0.d, p0/m, z0.s -; CHECK_BE-NEXT: addvl sp, sp, #1 -; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret %trunc = trunc %v to %bitcast = bitcast %trunc to diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir index 7730dde20395d..cd07de690cbde 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir @@ -17,20 +17,22 @@ body: | ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: %1:vgpr_32 = nofpexcept V_CVT_I32_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr0 = COPY %1 + ; GCN-NEXT: [[V_CVT_I32_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e64_]] + ; ; VI-LABEL: name: fptosi_s32_to_s32_vv ; VI: liveins: $vgpr0 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; VI-NEXT: %1:vgpr_32 = nofpexcept V_CVT_I32_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; VI-NEXT: $vgpr0 = COPY %1 + ; VI-NEXT: [[V_CVT_I32_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; VI-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e64_]] + ; ; GFX11-LABEL: name: fptosi_s32_to_s32_vv ; GFX11: liveins: $vgpr0 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: %1:vgpr_32 = nofpexcept V_CVT_I32_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY %1 + ; GFX11-NEXT: [[V_CVT_I32_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = G_FPTOSI %0 $vgpr0 = COPY %1 @@ -50,20 +52,22 @@ body: | ; GCN: liveins: $sgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: %1:vgpr_32 = nofpexcept V_CVT_I32_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr0 = COPY %1 + ; GCN-NEXT: [[V_CVT_I32_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e64_]] + ; ; VI-LABEL: name: fptosi_s32_to_s32_vs ; VI: liveins: $sgpr0 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; VI-NEXT: %1:vgpr_32 = nofpexcept V_CVT_I32_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; VI-NEXT: $vgpr0 = COPY %1 + ; VI-NEXT: [[V_CVT_I32_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; VI-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e64_]] + ; ; GFX11-LABEL: name: fptosi_s32_to_s32_vs ; GFX11: liveins: $sgpr0 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: %1:vgpr_32 = nofpexcept V_CVT_I32_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY %1 + ; GFX11-NEXT: [[V_CVT_I32_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; 
GFX11-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e64_]] %0:sgpr(s32) = COPY $sgpr0 %1:vgpr(s32) = G_FPTOSI %0 $vgpr0 = COPY %1 @@ -83,20 +87,22 @@ body: | ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: %2:vgpr_32 = nofpexcept V_CVT_I32_F32_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr0 = COPY %2 + ; GCN-NEXT: [[V_CVT_I32_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e64_]] + ; ; VI-LABEL: name: fptosi_s32_to_s32_fneg_vv ; VI: liveins: $vgpr0 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; VI-NEXT: %2:vgpr_32 = nofpexcept V_CVT_I32_F32_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; VI-NEXT: $vgpr0 = COPY %2 + ; VI-NEXT: [[V_CVT_I32_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; VI-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e64_]] + ; ; GFX11-LABEL: name: fptosi_s32_to_s32_fneg_vv ; GFX11: liveins: $vgpr0 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: %2:vgpr_32 = nofpexcept V_CVT_I32_F32_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY %2 + ; GFX11-NEXT: [[V_CVT_I32_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = G_FNEG %0 %2:vgpr(s32) = G_FPTOSI %1 @@ -117,23 +123,25 @@ body: | ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: %3:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %2:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %3, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr0 = COPY %2 + ; GCN-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] + ; ; VI-LABEL: name: fptosi_s16_to_s32_vv ; VI: liveins: $vgpr0 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; VI-NEXT: %3:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; VI-NEXT: %2:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %3, implicit $mode, implicit $exec - ; VI-NEXT: $vgpr0 = COPY %2 + ; VI-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec + ; VI-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] + ; ; GFX11-LABEL: name: fptosi_s16_to_s32_vv ; GFX11: liveins: $vgpr0 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: %3:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: %2:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %3, implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY %2 + ; GFX11-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 
[[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOSI %1 @@ -154,23 +162,25 @@ body: | ; GCN: liveins: $sgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: %3:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %2:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %3, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr0 = COPY %2 + ; GCN-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] + ; ; VI-LABEL: name: fptosi_s16_to_s32_vs ; VI: liveins: $sgpr0 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; VI-NEXT: %3:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; VI-NEXT: %2:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %3, implicit $mode, implicit $exec - ; VI-NEXT: $vgpr0 = COPY %2 + ; VI-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec + ; VI-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] + ; ; GFX11-LABEL: name: fptosi_s16_to_s32_vs ; GFX11: liveins: $sgpr0 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: %3:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: %2:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %3, implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY %2 + ; GFX11-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOSI %1 @@ -193,27 +203,29 @@ body: | ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 ; GCN-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec - ; GCN-NEXT: %4:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %3:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %4, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr0 = COPY %3 + ; GCN-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] + ; ; VI-LABEL: name: fptosi_s16_to_s32_fneg_vv ; VI: liveins: $vgpr0 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; VI-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 ; VI-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec - ; VI-NEXT: %4:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[V_XOR_B32_e64_]], 0, 0, 
implicit $mode, implicit $exec - ; VI-NEXT: %3:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %4, implicit $mode, implicit $exec - ; VI-NEXT: $vgpr0 = COPY %3 + ; VI-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec + ; VI-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] + ; ; GFX11-LABEL: name: fptosi_s16_to_s32_fneg_vv ; GFX11: liveins: $vgpr0 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 ; GFX11-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec - ; GFX11-NEXT: %4:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: %3:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %4, implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY %3 + ; GFX11-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_FNEG %1 @@ -235,23 +247,25 @@ body: | ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: %4:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %2:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %4, implicit $mode, implicit $exec - ; GCN-NEXT: S_ENDPGM 0, implicit %2 + ; GCN-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] + ; ; VI-LABEL: name: fptosi_s16_to_s1_vv ; VI: liveins: $vgpr0 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; VI-NEXT: %4:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; VI-NEXT: %2:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %4, implicit $mode, implicit $exec - ; VI-NEXT: S_ENDPGM 0, implicit %2 + ; VI-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec + ; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] + ; ; GFX11-LABEL: name: fptosi_s16_to_s1_vv ; GFX11: liveins: $vgpr0 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: %4:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: %2:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %4, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit %2 + ; GFX11-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-NEXT: 
S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOSI %1 @@ -273,23 +287,25 @@ body: | ; GCN: liveins: $sgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: %4:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %2:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %4, implicit $mode, implicit $exec - ; GCN-NEXT: S_ENDPGM 0, implicit %2 + ; GCN-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] + ; ; VI-LABEL: name: fptosi_s16_to_s1_vs ; VI: liveins: $sgpr0 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; VI-NEXT: %4:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; VI-NEXT: %2:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %4, implicit $mode, implicit $exec - ; VI-NEXT: S_ENDPGM 0, implicit %2 + ; VI-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec + ; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] + ; ; GFX11-LABEL: name: fptosi_s16_to_s1_vs ; GFX11: liveins: $sgpr0 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: %4:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: %2:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %4, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit %2 + ; GFX11-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOSI %1 @@ -313,27 +329,29 @@ body: | ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 ; GCN-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec - ; GCN-NEXT: %5:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %3:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %5, implicit $mode, implicit $exec - ; GCN-NEXT: S_ENDPGM 0, implicit %3 + ; GCN-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] + ; ; VI-LABEL: name: fptosi_s16_to_s1_fneg_vv ; VI: liveins: $vgpr0 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; VI-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 ; VI-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec - ; VI-NEXT: %5:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, 
implicit $exec - ; VI-NEXT: %3:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %5, implicit $mode, implicit $exec - ; VI-NEXT: S_ENDPGM 0, implicit %3 + ; VI-NEXT: [[V_CVT_F32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec + ; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] + ; ; GFX11-LABEL: name: fptosi_s16_to_s1_fneg_vv ; GFX11: liveins: $vgpr0 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 ; GFX11-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec - ; GFX11-NEXT: %5:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: %3:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %5, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit %3 + ; GFX11-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_FNEG %1 diff --git a/llvm/test/CodeGen/DirectX/CreateHandle.ll b/llvm/test/CodeGen/DirectX/CreateHandle.ll index 13d59c6caf6c9..40b3b2c712272 100644 --- a/llvm/test/CodeGen/DirectX/CreateHandle.ll +++ b/llvm/test/CodeGen/DirectX/CreateHandle.ll @@ -1,4 +1,14 @@ -; RUN: opt -S -dxil-op-lower %s | FileCheck %s +; RUN: opt -S -passes=dxil-op-lower,dxil-translate-metadata %s | FileCheck %s +; RUN: opt -S -passes=dxil-pretty-printer %s 2>&1 >/dev/null | FileCheck --check-prefix=CHECK-PRETTY %s + +; CHECK-PRETTY: Type Format Dim ID HLSL Bind Count +; CHECK-PRETTY: ---------- ------- ----------- ------- -------------- --------- +; CHECK-PRETTY: SRV f32 buf T0 t0 unbounded +; CHECK-PRETTY: SRV byte r/o T1 t8,space1 1 +; CHECK-PRETTY: SRV struct r/o T2 t2,space4 1 +; CHECK-PRETTY: SRV u32 buf T3 t3,space5 24 +; CHECK-PRETTY: UAV i32 buf U0 u7,space2 1 +; CHECK-PRETTY: UAV f32 buf U1 u5,space3 1 target triple = "dxil-pc-shadermodel6.0-compute" @@ -50,4 +60,12 @@ define void @test_buffers() { ret void } +; Just check that we have the right types and number of metadata nodes, the +; contents of the metadata are tested elsewhere. 
+; +; CHECK: !dx.resources = !{[[RESMD:![0-9]+]]} +; CHECK: [[RESMD]] = !{[[SRVMD:![0-9]+]], [[UAVMD:![0-9]+]], null, null} +; CHECK-DAG: [[SRVMD]] = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}} +; CHECK-DAG: [[UAVMD]] = !{!{{[0-9]+}}, !{{[0-9]+}}} + attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll b/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll index e78a0bf02e4ae..d0c80c018b8d7 100644 --- a/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll +++ b/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll @@ -1,4 +1,14 @@ -; RUN: opt -S -dxil-op-lower %s | FileCheck %s +; RUN: opt -S -passes=dxil-op-lower,dxil-translate-metadata %s | FileCheck %s +; RUN: opt -S -passes=dxil-pretty-printer %s 2>&1 >/dev/null | FileCheck --check-prefix=CHECK-PRETTY %s + +; CHECK-PRETTY: Type Format Dim ID HLSL Bind Count +; CHECK-PRETTY: ---------- ------- ----------- ------- -------------- --------- +; CHECK-PRETTY: SRV f32 buf T0 t0 unbounded +; CHECK-PRETTY: SRV byte r/o T1 t8,space1 1 +; CHECK-PRETTY: SRV struct r/o T2 t2,space4 1 +; CHECK-PRETTY: SRV u32 buf T3 t3,space5 24 +; CHECK-PRETTY: UAV i32 buf U0 u7,space2 1 +; CHECK-PRETTY: UAV f32 buf U1 u5,space3 1 target triple = "dxil-pc-shadermodel6.6-compute" @@ -55,4 +65,12 @@ define void @test_bindings() { ret void } +; Just check that we have the right types and number of metadata nodes, the +; contents of the metadata are tested elsewhere. +; +; CHECK: !dx.resources = !{[[RESMD:![0-9]+]]} +; CHECK: [[RESMD]] = !{[[SRVMD:![0-9]+]], [[UAVMD:![0-9]+]], null, null} +; CHECK-DAG: [[SRVMD]] = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}} +; CHECK-DAG: [[UAVMD]] = !{!{{[0-9]+}}, !{{[0-9]+}}} + attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/DirectX/all.ll b/llvm/test/CodeGen/DirectX/all.ll new file mode 100644 index 0000000000000..1c0b6486dc935 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/all.ll @@ -0,0 +1,83 @@ +; RUN: opt -S -passes=dxil-intrinsic-expansion,dxil-op-lower -mtriple=dxil-pc-shadermodel6.0-library < %s | FileCheck %s + +; Make sure dxil operation function calls for all are generated for float and half. 
+ +; CHECK-LABEL: all_bool +; CHECK: icmp ne i1 %{{.*}}, false +define noundef i1 @all_bool(i1 noundef %p0) { +entry: + %dx.all = call i1 @llvm.dx.all.i1(i1 %p0) + ret i1 %dx.all +} + +; CHECK-LABEL: all_int64_t +; CHECK: icmp ne i64 %{{.*}}, 0 +define noundef i1 @all_int64_t(i64 noundef %p0) { +entry: + %dx.all = call i1 @llvm.dx.all.i64(i64 %p0) + ret i1 %dx.all +} + +; CHECK-LABEL: all_int +; CHECK: icmp ne i32 %{{.*}}, 0 +define noundef i1 @all_int(i32 noundef %p0) { +entry: + %dx.all = call i1 @llvm.dx.all.i32(i32 %p0) + ret i1 %dx.all +} + +; CHECK-LABEL: all_int16_t +; CHECK: icmp ne i16 %{{.*}}, 0 +define noundef i1 @all_int16_t(i16 noundef %p0) { +entry: + %dx.all = call i1 @llvm.dx.all.i16(i16 %p0) + ret i1 %dx.all +} + +; CHECK-LABEL: all_double +; CHECK: fcmp une double %{{.*}}, 0.000000e+00 +define noundef i1 @all_double(double noundef %p0) { +entry: + %dx.all = call i1 @llvm.dx.all.f64(double %p0) + ret i1 %dx.all +} + +; CHECK-LABEL: all_float +; CHECK: fcmp une float %{{.*}}, 0.000000e+00 +define noundef i1 @all_float(float noundef %p0) { +entry: + %dx.all = call i1 @llvm.dx.all.f32(float %p0) + ret i1 %dx.all +} + +; CHECK-LABEL: all_half +; CHECK: fcmp une half %{{.*}}, 0xH0000 +define noundef i1 @all_half(half noundef %p0) { +entry: + %dx.all = call i1 @llvm.dx.all.f16(half %p0) + ret i1 %dx.all +} + +; CHECK-LABEL: all_bool4 +; CHECK: icmp ne <4 x i1> %{{.*}}, zeroinitialize +; CHECK: extractelement <4 x i1> %{{.*}}, i64 0 +; CHECK: extractelement <4 x i1> %{{.*}}, i64 1 +; CHECK: and i1 %{{.*}}, %{{.*}} +; CHECK: extractelement <4 x i1> %{{.*}}, i64 2 +; CHECK: and i1 %{{.*}}, %{{.*}} +; CHECK: extractelement <4 x i1> %{{.*}}, i64 3 +; CHECK: and i1 %{{.*}}, %{{.*}} +define noundef i1 @all_bool4(<4 x i1> noundef %p0) { +entry: + %dx.all = call i1 @llvm.dx.all.v4i1(<4 x i1> %p0) + ret i1 %dx.all +} + +declare i1 @llvm.dx.all.v4i1(<4 x i1>) +declare i1 @llvm.dx.all.i1(i1) +declare i1 @llvm.dx.all.i16(i16) +declare i1 @llvm.dx.all.i32(i32) +declare i1 @llvm.dx.all.i64(i64) +declare i1 @llvm.dx.all.f16(half) +declare i1 @llvm.dx.all.f32(float) +declare i1 @llvm.dx.all.f64(double) diff --git a/llvm/test/CodeGen/MLRegAlloc/Inputs/interactive_main.py b/llvm/test/CodeGen/MLRegAlloc/Inputs/interactive_main.py index 53809b0a04008..1f62a5c6c9a3b 100644 --- a/llvm/test/CodeGen/MLRegAlloc/Inputs/interactive_main.py +++ b/llvm/test/CodeGen/MLRegAlloc/Inputs/interactive_main.py @@ -2,6 +2,7 @@ import interactive_host import sys +from typing import Sequence def main(args): # this advisor just picks the first legal register to evict, which is @@ -9,7 +10,7 @@ def main(args): class Advisor: to_return = False - def advice(self, tensor_values: list[log_reader.TensorValue]): + def advice(self, tensor_values: Sequence[log_reader.TensorValue]): for tv in tensor_values: if tv.spec().name != "mask": continue diff --git a/llvm/test/CodeGen/MLRegAlloc/lit.local.cfg b/llvm/test/CodeGen/MLRegAlloc/lit.local.cfg deleted file mode 100644 index e8c7912650cb8..0000000000000 --- a/llvm/test/CodeGen/MLRegAlloc/lit.local.cfg +++ /dev/null @@ -1,3 +0,0 @@ -import sys - -config.unsupported = sys.version_info.minor <= 8 diff --git a/llvm/test/CodeGen/Mips/fp-fcanonicalize.ll b/llvm/test/CodeGen/Mips/fp-fcanonicalize.ll new file mode 100644 index 0000000000000..1faf6dd0891a2 --- /dev/null +++ b/llvm/test/CodeGen/Mips/fp-fcanonicalize.ll @@ -0,0 +1,97 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=mipsisa32r6 < %s | FileCheck %s --check-prefix=MIPS32R6 
+; RUN: llc --mtriple=mips < %s | FileCheck %s --check-prefix=MIPS32R2 +; RUN: llc --mtriple=mips64 < %s | FileCheck %s --check-prefix=MIPS64R2 + +declare float @llvm.fcanonicalize.f32(float) +declare double @llvm.fcanonicalize.f64(double) + +define float @fcanonicalize_float(float %x) { +; MIPS32R6-LABEL: fcanonicalize_float: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: min.s $f0, $f12, $f12 +; +; MIPS32R2-LABEL: fcanonicalize_float: +; MIPS32R2: # %bb.0: +; MIPS32R2-NEXT: mov.s $f0, $f12 +; MIPS32R2-NEXT: add.s $f1, $f12, $f12 +; MIPS32R2-NEXT: c.un.s $f12, $f12 +; MIPS32R2-NEXT: jr $ra +; MIPS32R2-NEXT: movt.s $f0, $f1, $fcc0 +; +; MIPS64R2-LABEL: fcanonicalize_float: +; MIPS64R2: # %bb.0: +; MIPS64R2-NEXT: mov.s $f0, $f12 +; MIPS64R2-NEXT: add.s $f1, $f12, $f12 +; MIPS64R2-NEXT: c.un.s $f12, $f12 +; MIPS64R2-NEXT: jr $ra +; MIPS64R2-NEXT: movt.s $f0, $f1, $fcc0 + %z = call float @llvm.canonicalize.f32(float %x) + ret float %z +} + +define float @fcanonicalize_float_nnan(float %x) { +; MIPS32R6-LABEL: fcanonicalize_float_nnan: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: min.s $f0, $f12, $f12 +; +; MIPS32R2-LABEL: fcanonicalize_float_nnan: +; MIPS32R2: # %bb.0: +; MIPS32R2-NEXT: jr $ra +; MIPS32R2-NEXT: mov.s $f0, $f12 +; +; MIPS64R2-LABEL: fcanonicalize_float_nnan: +; MIPS64R2: # %bb.0: +; MIPS64R2-NEXT: jr $ra +; MIPS64R2-NEXT: mov.s $f0, $f12 + %z = call nnan float @llvm.canonicalize.f32(float %x) + ret float %z +} + + +define double @fcanonicalize_double(double %x) { +; MIPS32R6-LABEL: fcanonicalize_double: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: min.d $f0, $f12, $f12 +; +; MIPS32R2-LABEL: fcanonicalize_double: +; MIPS32R2: # %bb.0: +; MIPS32R2-NEXT: mov.d $f0, $f12 +; MIPS32R2-NEXT: add.d $f2, $f12, $f12 +; MIPS32R2-NEXT: c.un.d $f12, $f12 +; MIPS32R2-NEXT: jr $ra +; MIPS32R2-NEXT: movt.d $f0, $f2, $fcc0 +; +; MIPS64R2-LABEL: fcanonicalize_double: +; MIPS64R2: # %bb.0: +; MIPS64R2-NEXT: mov.d $f0, $f12 +; MIPS64R2-NEXT: add.d $f1, $f12, $f12 +; MIPS64R2-NEXT: c.un.d $f12, $f12 +; MIPS64R2-NEXT: jr $ra +; MIPS64R2-NEXT: movt.d $f0, $f1, $fcc0 + %z = call double @llvm.canonicalize.f64(double %x) + ret double %z +} + +define double @fcanonicalize_double_nnan(double %x) { +; MIPS32R6-LABEL: fcanonicalize_double_nnan: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: min.d $f0, $f12, $f12 +; +; MIPS32R2-LABEL: fcanonicalize_double_nnan: +; MIPS32R2: # %bb.0: +; MIPS32R2-NEXT: jr $ra +; MIPS32R2-NEXT: mov.d $f0, $f12 +; +; MIPS64R2-LABEL: fcanonicalize_double_nnan: +; MIPS64R2: # %bb.0: +; MIPS64R2-NEXT: jr $ra +; MIPS64R2-NEXT: mov.d $f0, $f12 + %z = call nnan double @llvm.canonicalize.f64(double %x) + ret double %z +} + diff --git a/llvm/test/CodeGen/PowerPC/O0-pipeline.ll b/llvm/test/CodeGen/PowerPC/O0-pipeline.ll index 70b421f8c0c5f..4a17384e49993 100644 --- a/llvm/test/CodeGen/PowerPC/O0-pipeline.ll +++ b/llvm/test/CodeGen/PowerPC/O0-pipeline.ll @@ -59,7 +59,6 @@ ; CHECK-NEXT: Insert XRay ops ; CHECK-NEXT: Implement the 'patchable-function' attribute ; CHECK-NEXT: PowerPC Pre-Emit Peephole -; CHECK-NEXT: PowerPC Expand ISEL Generation ; CHECK-NEXT: Contiguously Lay Out Funclets ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis diff --git a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll index 60d42704ca795..39b23a57513d9 100644 --- a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll +++ b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll @@ -212,7 +212,6 
@@ ; CHECK-NEXT: Insert XRay ops ; CHECK-NEXT: Implement the 'patchable-function' attribute ; CHECK-NEXT: PowerPC Pre-Emit Peephole -; CHECK-NEXT: PowerPC Expand ISEL Generation ; CHECK-NEXT: PowerPC Early-Return Creation ; CHECK-NEXT: Contiguously Lay Out Funclets ; CHECK-NEXT: StackMap Liveness Analysis diff --git a/llvm/test/CodeGen/PowerPC/crbit-asm.ll b/llvm/test/CodeGen/PowerPC/crbit-asm.ll index 617d6ec27b63f..2062aa3e34417 100644 --- a/llvm/test/CodeGen/PowerPC/crbit-asm.ll +++ b/llvm/test/CodeGen/PowerPC/crbit-asm.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s ; RUN: llc -verify-machineinstrs -O1 -mcpu=pwr7 < %s | FileCheck %s -; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-gen-isel=false < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s -; RUN: llc -verify-machineinstrs -O1 -mcpu=pwr7 -ppc-gen-isel=false < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-isel < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s +; RUN: llc -verify-machineinstrs -O1 -mcpu=pwr7 -mattr=-isel < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" @@ -26,15 +26,13 @@ define zeroext i1 @testi1(i1 zeroext %b1, i1 zeroext %b2) #0 { ; CHECK-NO-ISEL-NEXT: andi. 3, 3, 1 ; CHECK-NO-ISEL-NEXT: crmove 20, 1 ; CHECK-NO-ISEL-NEXT: andi. 3, 4, 1 -; CHECK-NO-ISEL-NEXT: li 3, 0 -; CHECK-NO-ISEL-NEXT: li 4, 1 +; CHECK-NO-ISEL-NEXT: li 3, 1 ; CHECK-NO-ISEL-NEXT: #APP ; CHECK-NO-ISEL-NEXT: crand 20, 20, 1 ; CHECK-NO-ISEL-NEXT: #NO_APP -; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB0_1 -; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL-NEXT: .LBB0_1: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 4, 0 +; CHECK-NO-ISEL-NEXT: bclr 12, 20, 0 +; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry +; CHECK-NO-ISEL-NEXT: li 3, 0 ; CHECK-NO-ISEL-NEXT: blr entry: %0 = tail call i8 asm "crand $0, $1, $2", "=^wc,^wc,^wc"(i1 %b1, i1 %b2) #0 @@ -63,15 +61,13 @@ define signext i32 @testi32(i32 signext %b1, i32 signext %b2) #0 { ; CHECK-NO-ISEL-NEXT: andi. 3, 3, 1 ; CHECK-NO-ISEL-NEXT: crmove 20, 1 ; CHECK-NO-ISEL-NEXT: andi. 3, 4, 1 -; CHECK-NO-ISEL-NEXT: li 3, 0 -; CHECK-NO-ISEL-NEXT: li 4, -1 +; CHECK-NO-ISEL-NEXT: li 3, -1 ; CHECK-NO-ISEL-NEXT: #APP ; CHECK-NO-ISEL-NEXT: crand 20, 20, 1 ; CHECK-NO-ISEL-NEXT: #NO_APP -; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB1_1 -; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL-NEXT: .LBB1_1: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 4, 0 +; CHECK-NO-ISEL-NEXT: bclr 12, 20, 0 +; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry +; CHECK-NO-ISEL-NEXT: li 3, 0 ; CHECK-NO-ISEL-NEXT: blr entry: %0 = tail call i32 asm "crand $0, $1, $2", "=^wc,^wc,^wc"(i32 %b1, i32 %b2) #0 @@ -101,15 +97,13 @@ define zeroext i8 @testi8(i8 zeroext %b1, i8 zeroext %b2) #0 { ; CHECK-NO-ISEL-NEXT: andi. 3, 3, 1 ; CHECK-NO-ISEL-NEXT: crmove 20, 1 ; CHECK-NO-ISEL-NEXT: andi. 
3, 4, 1 -; CHECK-NO-ISEL-NEXT: li 3, 0 -; CHECK-NO-ISEL-NEXT: li 4, 1 +; CHECK-NO-ISEL-NEXT: li 3, 1 ; CHECK-NO-ISEL-NEXT: #APP ; CHECK-NO-ISEL-NEXT: crand 20, 20, 1 ; CHECK-NO-ISEL-NEXT: #NO_APP -; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB2_1 -; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL-NEXT: .LBB2_1: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 4, 0 +; CHECK-NO-ISEL-NEXT: bclr 12, 20, 0 +; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry +; CHECK-NO-ISEL-NEXT: li 3, 0 ; CHECK-NO-ISEL-NEXT: blr entry: %0 = tail call i8 asm "crand $0, $1, $2", "=^wc,^wc,^wc"(i8 %b1, i8 %b2) #0 diff --git a/llvm/test/CodeGen/PowerPC/crbits.ll b/llvm/test/CodeGen/PowerPC/crbits.ll index a682f69a2ceb7..763f596777a64 100644 --- a/llvm/test/CodeGen/PowerPC/crbits.ll +++ b/llvm/test/CodeGen/PowerPC/crbits.ll @@ -2,7 +2,7 @@ ; RUN: llc -ppc-gpr-icmps=all -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s ; RUN: llc -ppc-gpr-icmps=all -mtriple=powerpc64-unknown-linux-gnu \ -; RUN: -verify-machineinstrs -mcpu=pwr7 -ppc-gen-isel=false < %s | \ +; RUN: -verify-machineinstrs -mcpu=pwr7 -mattr=-isel < %s | \ ; RUN: FileCheck --check-prefix=CHECK-NO-ISEL %s ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ ; RUN: -ppc-asm-full-reg-names -mcpu=pwr10 -ppc-gpr-icmps=none < %s | \ @@ -30,16 +30,16 @@ define zeroext i1 @test1(float %v1, float %v2) #0 { ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: fcmpu 0, 1, 2 ; CHECK-NO-ISEL-NEXT: xxlxor 0, 0, 0 -; CHECK-NO-ISEL-NEXT: li 3, 1 +; CHECK-NO-ISEL-NEXT: li 3, 0 ; CHECK-NO-ISEL-NEXT: fcmpu 1, 2, 2 ; CHECK-NO-ISEL-NEXT: crnor 20, 3, 0 ; CHECK-NO-ISEL-NEXT: fcmpu 0, 2, 0 -; CHECK-NO-ISEL-NEXT: crnor 21, 7, 1 -; CHECK-NO-ISEL-NEXT: crnand 20, 20, 21 -; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB0_1 -; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL-NEXT: .LBB0_1: # %entry -; CHECK-NO-ISEL-NEXT: li 3, 0 +; CHECK-NO-ISEL-NEXT: bclr 4, 20, 0 +; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry +; CHECK-NO-ISEL-NEXT: crnor 20, 7, 1 +; CHECK-NO-ISEL-NEXT: bclr 4, 20, 0 +; CHECK-NO-ISEL-NEXT: # %bb.2: # %entry +; CHECK-NO-ISEL-NEXT: li 3, 1 ; CHECK-NO-ISEL-NEXT: blr ; ; CHECK-P10-LABEL: test1: @@ -81,16 +81,15 @@ define zeroext i1 @test2(float %v1, float %v2) #0 { ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: fcmpu 0, 1, 2 ; CHECK-NO-ISEL-NEXT: xxlxor 0, 0, 0 -; CHECK-NO-ISEL-NEXT: li 3, 1 +; CHECK-NO-ISEL-NEXT: li 3, 0 ; CHECK-NO-ISEL-NEXT: fcmpu 1, 2, 2 ; CHECK-NO-ISEL-NEXT: crnor 20, 3, 0 ; CHECK-NO-ISEL-NEXT: fcmpu 0, 2, 0 ; CHECK-NO-ISEL-NEXT: crnor 21, 7, 1 ; CHECK-NO-ISEL-NEXT: creqv 20, 20, 21 -; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB1_1 -; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL-NEXT: .LBB1_1: # %entry -; CHECK-NO-ISEL-NEXT: li 3, 0 +; CHECK-NO-ISEL-NEXT: bclr 12, 20, 0 +; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry +; CHECK-NO-ISEL-NEXT: li 3, 1 ; CHECK-NO-ISEL-NEXT: blr ; ; CHECK-P10-LABEL: test2: @@ -134,7 +133,7 @@ define zeroext i1 @test3(float %v1, float %v2, i32 signext %x) #0 { ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: fcmpu 0, 1, 2 ; CHECK-NO-ISEL-NEXT: xxlxor 0, 0, 0 -; CHECK-NO-ISEL-NEXT: li 3, 1 +; CHECK-NO-ISEL-NEXT: li 3, 0 ; CHECK-NO-ISEL-NEXT: fcmpu 1, 2, 2 ; CHECK-NO-ISEL-NEXT: crnor 20, 3, 0 ; CHECK-NO-ISEL-NEXT: fcmpu 0, 2, 0 @@ -142,10 +141,9 @@ define zeroext i1 @test3(float %v1, float %v2, i32 signext %x) #0 { ; CHECK-NO-ISEL-NEXT: cmpwi 5, -2 ; CHECK-NO-ISEL-NEXT: crandc 21, 21, 2 ; CHECK-NO-ISEL-NEXT: creqv 20, 20, 21 -; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB2_1 -; CHECK-NO-ISEL-NEXT: blr -; 
CHECK-NO-ISEL-NEXT: .LBB2_1: # %entry -; CHECK-NO-ISEL-NEXT: li 3, 0 +; CHECK-NO-ISEL-NEXT: bclr 12, 20, 0 +; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry +; CHECK-NO-ISEL-NEXT: li 3, 1 ; CHECK-NO-ISEL-NEXT: blr ; ; CHECK-P10-LABEL: test3: @@ -301,10 +299,9 @@ define signext i32 @test7(i1 zeroext %v2, i32 signext %i1, i32 signext %i2) #0 { ; CHECK-NO-ISEL-NEXT: andi. 3, 3, 1 ; CHECK-NO-ISEL-NEXT: bc 12, 1, .LBB6_2 ; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 5, 0 -; CHECK-NO-ISEL-NEXT: blr +; CHECK-NO-ISEL-NEXT: mr 4, 5 ; CHECK-NO-ISEL-NEXT: .LBB6_2: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 4, 0 +; CHECK-NO-ISEL-NEXT: mr 3, 4 ; CHECK-NO-ISEL-NEXT: blr ; ; CHECK-P10-LABEL: test7: @@ -330,12 +327,10 @@ define signext i32 @exttest7(i32 signext %a) #0 { ; CHECK-NO-ISEL-LABEL: exttest7: ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: cmplwi 3, 5 +; CHECK-NO-ISEL-NEXT: li 3, 7 +; CHECK-NO-ISEL-NEXT: beqlr 0 +; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry ; CHECK-NO-ISEL-NEXT: li 3, 8 -; CHECK-NO-ISEL-NEXT: li 4, 7 -; CHECK-NO-ISEL-NEXT: bc 12, 2, .LBB7_1 -; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL-NEXT: .LBB7_1: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 4, 0 ; CHECK-NO-ISEL-NEXT: blr ; ; CHECK-P10-LABEL: exttest7: @@ -366,15 +361,15 @@ define zeroext i32 @exttest8() #0 { ; CHECK-NO-ISEL-LABEL: exttest8: ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: ld 3, 0(3) +; CHECK-NO-ISEL-NEXT: li 4, 0 ; CHECK-NO-ISEL-NEXT: subfic 3, 3, 80 ; CHECK-NO-ISEL-NEXT: rldicl 3, 3, 63, 1 ; CHECK-NO-ISEL-NEXT: cmplwi 3, 80 -; CHECK-NO-ISEL-NEXT: bc 12, 1, .LBB8_1 -; CHECK-NO-ISEL-NEXT: b .LBB8_2 -; CHECK-NO-ISEL-NEXT: .LBB8_1: # %entry -; CHECK-NO-ISEL-NEXT: li 3, 0 +; CHECK-NO-ISEL-NEXT: bgt 0, .LBB8_2 +; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry +; CHECK-NO-ISEL-NEXT: mr 4, 3 ; CHECK-NO-ISEL-NEXT: .LBB8_2: # %entry -; CHECK-NO-ISEL-NEXT: clrldi 3, 3, 32 +; CHECK-NO-ISEL-NEXT: clrldi 3, 4, 32 ; CHECK-NO-ISEL-NEXT: blr ; ; CHECK-P10-LABEL: exttest8: diff --git a/llvm/test/CodeGen/PowerPC/expand-contiguous-isel.ll b/llvm/test/CodeGen/PowerPC/expand-contiguous-isel.ll index 15b7dc1a38fab..9e53c7e88b0e3 100644 --- a/llvm/test/CodeGen/PowerPC/expand-contiguous-isel.ll +++ b/llvm/test/CodeGen/PowerPC/expand-contiguous-isel.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 target datalayout = "e-m:e-i64:64-n32:64" target triple = "powerpc64le-unknown-linux-gnu" ; This file mainly tests that one of the ISEL instruction in the group uses the same register for operand RT, RA, RB @@ -13,8 +14,8 @@ target triple = "powerpc64le-unknown-linux-gnu" ; After that we have: ; updated: 1504B %vreg83 = ISEL8 %vreg83, %vreg83, %vreg33:sub_eq -; RUN: llc -verify-machineinstrs -O2 -ppc-asm-full-reg-names -mcpu=pwr7 -ppc-gen-isel=true < %s | FileCheck %s --check-prefix=CHECK-GEN-ISEL-TRUE -; RUN: llc -verify-machineinstrs -O2 -ppc-asm-full-reg-names -mcpu=pwr7 -ppc-gen-isel=false < %s | FileCheck %s --implicit-check-not isel +; RUN: llc -verify-machineinstrs -O2 -ppc-asm-full-reg-names -mcpu=pwr7 -mattr=+isel < %s | FileCheck %s --check-prefix=CHECK-GEN-ISEL-TRUE +; RUN: llc -verify-machineinstrs -O2 -ppc-asm-full-reg-names -mcpu=pwr7 -mattr=-isel < %s | FileCheck %s --implicit-check-not isel @.str = private unnamed_addr constant [3 x i8] c"]]\00", align 1 @.str.1 = private unnamed_addr constant [35 x i8] c"Index < Length && \22Invalid index!\22\00", align 1 @@ -23,6 +24,219 @@ target triple = "powerpc64le-unknown-linux-gnu" @.str.3 = private 
unnamed_addr constant [95 x i8] c"(data || length == 0) && \22StringRef cannot be built from a NULL argument with non-null length\22\00", align 1 @__PRETTY_FUNCTION__._ZN4llvm9StringRefC2EPKcm = private unnamed_addr constant [49 x i8] c"llvm::StringRef::StringRef(const char *, size_t)\00", align 1 define i64 @_Z3fn1N4llvm9StringRefE([2 x i64] %Str.coerce) { +; CHECK-GEN-ISEL-TRUE-LABEL: _Z3fn1N4llvm9StringRefE: +; CHECK-GEN-ISEL-TRUE: # %bb.0: # %entry +; CHECK-GEN-ISEL-TRUE-NEXT: mflr r0 +; CHECK-GEN-ISEL-TRUE-NEXT: stdu r1, -32(r1) +; CHECK-GEN-ISEL-TRUE-NEXT: std r0, 48(r1) +; CHECK-GEN-ISEL-TRUE-NEXT: .cfi_def_cfa_offset 32 +; CHECK-GEN-ISEL-TRUE-NEXT: .cfi_offset lr, 16 +; CHECK-GEN-ISEL-TRUE-NEXT: li r5, 2 +; CHECK-GEN-ISEL-TRUE-NEXT: # implicit-def: $x6 +; CHECK-GEN-ISEL-TRUE-NEXT: b .LBB0_3 +; CHECK-GEN-ISEL-TRUE-NEXT: .p2align 4 +; CHECK-GEN-ISEL-TRUE-NEXT: .LBB0_1: # %_ZNK4llvm9StringRefixEm.exit +; CHECK-GEN-ISEL-TRUE-NEXT: # +; CHECK-GEN-ISEL-TRUE-NEXT: cmplwi r7, 93 +; CHECK-GEN-ISEL-TRUE-NEXT: addi r7, r6, -1 +; CHECK-GEN-ISEL-TRUE-NEXT: iseleq r6, r7, r6 +; CHECK-GEN-ISEL-TRUE-NEXT: .LBB0_2: # %_ZNK4llvm9StringRef6substrEmm.exit +; CHECK-GEN-ISEL-TRUE-NEXT: # +; CHECK-GEN-ISEL-TRUE-NEXT: addi r4, r4, -1 +; CHECK-GEN-ISEL-TRUE-NEXT: addi r3, r3, 1 +; CHECK-GEN-ISEL-TRUE-NEXT: .LBB0_3: # %while.cond.outer +; CHECK-GEN-ISEL-TRUE-NEXT: # =>This Loop Header: Depth=1 +; CHECK-GEN-ISEL-TRUE-NEXT: # Child Loop BB0_5 Depth 2 +; CHECK-GEN-ISEL-TRUE-NEXT: # Child Loop BB0_8 Depth 2 +; CHECK-GEN-ISEL-TRUE-NEXT: cmpldi r6, 0 +; CHECK-GEN-ISEL-TRUE-NEXT: beq cr0, .LBB0_8 +; CHECK-GEN-ISEL-TRUE-NEXT: # %bb.4: # %while.cond.preheader +; CHECK-GEN-ISEL-TRUE-NEXT: # +; CHECK-GEN-ISEL-TRUE-NEXT: cmpldi r4, 0 +; CHECK-GEN-ISEL-TRUE-NEXT: beq- cr0, .LBB0_15 +; CHECK-GEN-ISEL-TRUE-NEXT: .p2align 5 +; CHECK-GEN-ISEL-TRUE-NEXT: .LBB0_5: # %_ZNK4llvm9StringRefixEm.exit +; CHECK-GEN-ISEL-TRUE-NEXT: # Parent Loop BB0_3 Depth=1 +; CHECK-GEN-ISEL-TRUE-NEXT: # => This Inner Loop Header: Depth=2 +; CHECK-GEN-ISEL-TRUE-NEXT: lbz r7, 0(r3) +; CHECK-GEN-ISEL-TRUE-NEXT: cmplwi r7, 92 +; CHECK-GEN-ISEL-TRUE-NEXT: bne cr0, .LBB0_1 +; CHECK-GEN-ISEL-TRUE-NEXT: # %bb.6: # %if.then4 +; CHECK-GEN-ISEL-TRUE-NEXT: # +; CHECK-GEN-ISEL-TRUE-NEXT: cmpldi r4, 2 +; CHECK-GEN-ISEL-TRUE-NEXT: isellt r7, r4, r5 +; CHECK-GEN-ISEL-TRUE-NEXT: add r3, r3, r7 +; CHECK-GEN-ISEL-TRUE-NEXT: sub. 
r4, r4, r7 +; CHECK-GEN-ISEL-TRUE-NEXT: bne+ cr0, .LBB0_5 +; CHECK-GEN-ISEL-TRUE-NEXT: b .LBB0_15 +; CHECK-GEN-ISEL-TRUE-NEXT: .p2align 5 +; CHECK-GEN-ISEL-TRUE-NEXT: .LBB0_7: # %if.then4.us +; CHECK-GEN-ISEL-TRUE-NEXT: # +; CHECK-GEN-ISEL-TRUE-NEXT: isellt r6, r4, r5 +; CHECK-GEN-ISEL-TRUE-NEXT: add r3, r3, r6 +; CHECK-GEN-ISEL-TRUE-NEXT: sub r4, r4, r6 +; CHECK-GEN-ISEL-TRUE-NEXT: .LBB0_8: # %while.cond.us +; CHECK-GEN-ISEL-TRUE-NEXT: # Parent Loop BB0_3 Depth=1 +; CHECK-GEN-ISEL-TRUE-NEXT: # => This Inner Loop Header: Depth=2 +; CHECK-GEN-ISEL-TRUE-NEXT: cmpldi r4, 2 +; CHECK-GEN-ISEL-TRUE-NEXT: bge cr0, .LBB0_10 +; CHECK-GEN-ISEL-TRUE-NEXT: # %bb.9: # %if.end.us +; CHECK-GEN-ISEL-TRUE-NEXT: # +; CHECK-GEN-ISEL-TRUE-NEXT: cmpldi cr1, r4, 0 +; CHECK-GEN-ISEL-TRUE-NEXT: bne+ cr1, .LBB0_11 +; CHECK-GEN-ISEL-TRUE-NEXT: b .LBB0_15 +; CHECK-GEN-ISEL-TRUE-NEXT: .p2align 5 +; CHECK-GEN-ISEL-TRUE-NEXT: .LBB0_10: # %if.end.i.i.us +; CHECK-GEN-ISEL-TRUE-NEXT: # +; CHECK-GEN-ISEL-TRUE-NEXT: lhz r6, 0(r3) +; CHECK-GEN-ISEL-TRUE-NEXT: cmplwi cr1, r6, 23901 +; CHECK-GEN-ISEL-TRUE-NEXT: beq cr1, .LBB0_14 +; CHECK-GEN-ISEL-TRUE-NEXT: .LBB0_11: # %_ZNK4llvm9StringRefixEm.exit.us +; CHECK-GEN-ISEL-TRUE-NEXT: # +; CHECK-GEN-ISEL-TRUE-NEXT: lbz r6, 0(r3) +; CHECK-GEN-ISEL-TRUE-NEXT: cmplwi cr1, r6, 92 +; CHECK-GEN-ISEL-TRUE-NEXT: beq cr1, .LBB0_7 +; CHECK-GEN-ISEL-TRUE-NEXT: # %bb.12: # %_ZNK4llvm9StringRefixEm.exit.us +; CHECK-GEN-ISEL-TRUE-NEXT: # +; CHECK-GEN-ISEL-TRUE-NEXT: cmplwi r6, 93 +; CHECK-GEN-ISEL-TRUE-NEXT: beq cr0, .LBB0_16 +; CHECK-GEN-ISEL-TRUE-NEXT: # %bb.13: # %_ZNK4llvm9StringRef6substrEmm.exit.loopexit +; CHECK-GEN-ISEL-TRUE-NEXT: # +; CHECK-GEN-ISEL-TRUE-NEXT: li r6, 0 +; CHECK-GEN-ISEL-TRUE-NEXT: b .LBB0_2 +; CHECK-GEN-ISEL-TRUE-NEXT: .LBB0_14: # %if.then +; CHECK-GEN-ISEL-TRUE-NEXT: addi r1, r1, 32 +; CHECK-GEN-ISEL-TRUE-NEXT: ld r0, 16(r1) +; CHECK-GEN-ISEL-TRUE-NEXT: mtlr r0 +; CHECK-GEN-ISEL-TRUE-NEXT: blr +; CHECK-GEN-ISEL-TRUE-NEXT: .LBB0_15: # %cond.false.i +; CHECK-GEN-ISEL-TRUE-NEXT: addis r3, r2, .L__ModuleStringPool@toc@ha +; CHECK-GEN-ISEL-TRUE-NEXT: li r5, 225 +; CHECK-GEN-ISEL-TRUE-NEXT: addi r4, r3, .L__ModuleStringPool@toc@l +; CHECK-GEN-ISEL-TRUE-NEXT: addi r3, r4, 53 +; CHECK-GEN-ISEL-TRUE-NEXT: addi r6, r4, 88 +; CHECK-GEN-ISEL-TRUE-NEXT: bl __assert_fail +; CHECK-GEN-ISEL-TRUE-NEXT: nop +; CHECK-GEN-ISEL-TRUE-NEXT: .LBB0_16: # %if.then9 +; CHECK-GEN-ISEL-TRUE-NEXT: li r3, 1 +; CHECK-GEN-ISEL-TRUE-NEXT: bl exit +; CHECK-GEN-ISEL-TRUE-NEXT: nop +; +; CHECK-LABEL: _Z3fn1N4llvm9StringRefE: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -32(r1) +; CHECK-NEXT: std r0, 48(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: # implicit-def: $x5 +; CHECK-NEXT: b .LBB0_2 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_1: # %_ZNK4llvm9StringRef6substrEmm.exit +; CHECK-NEXT: # +; CHECK-NEXT: addi r4, r4, -1 +; CHECK-NEXT: addi r3, r3, 1 +; CHECK-NEXT: .LBB0_2: # %while.cond.outer +; CHECK-NEXT: # =>This Loop Header: Depth=1 +; CHECK-NEXT: # Child Loop BB0_5 Depth 2 +; CHECK-NEXT: # Child Loop BB0_9 Depth 2 +; CHECK-NEXT: cmpldi r5, 0 +; CHECK-NEXT: beq cr0, .LBB0_9 +; CHECK-NEXT: # %bb.3: # %while.cond.preheader +; CHECK-NEXT: # +; CHECK-NEXT: cmpldi r4, 0 +; CHECK-NEXT: bne+ cr0, .LBB0_5 +; CHECK-NEXT: b .LBB0_20 +; CHECK-NEXT: .p2align 5 +; CHECK-NEXT: .LBB0_4: # %if.then4 +; CHECK-NEXT: # +; CHECK-NEXT: add r3, r3, r6 +; CHECK-NEXT: sub. 
r4, r4, r6 +; CHECK-NEXT: beq- cr0, .LBB0_20 +; CHECK-NEXT: .LBB0_5: # %_ZNK4llvm9StringRefixEm.exit +; CHECK-NEXT: # Parent Loop BB0_2 Depth=1 +; CHECK-NEXT: # => This Inner Loop Header: Depth=2 +; CHECK-NEXT: lbz r6, 0(r3) +; CHECK-NEXT: cmplwi r6, 92 +; CHECK-NEXT: bne cr0, .LBB0_15 +; CHECK-NEXT: # %bb.6: # %if.then4 +; CHECK-NEXT: # +; CHECK-NEXT: cmpldi r4, 2 +; CHECK-NEXT: mr r6, r4 +; CHECK-NEXT: blt cr0, .LBB0_4 +; CHECK-NEXT: # %bb.7: # %if.then4 +; CHECK-NEXT: # +; CHECK-NEXT: li r6, 2 +; CHECK-NEXT: b .LBB0_4 +; CHECK-NEXT: .p2align 5 +; CHECK-NEXT: .LBB0_8: # %if.then4.us +; CHECK-NEXT: # +; CHECK-NEXT: add r3, r3, r5 +; CHECK-NEXT: sub r4, r4, r5 +; CHECK-NEXT: .LBB0_9: # %while.cond.us +; CHECK-NEXT: # Parent Loop BB0_2 Depth=1 +; CHECK-NEXT: # => This Inner Loop Header: Depth=2 +; CHECK-NEXT: cmpldi r4, 2 +; CHECK-NEXT: bge cr0, .LBB0_11 +; CHECK-NEXT: # %bb.10: # %if.end.us +; CHECK-NEXT: # +; CHECK-NEXT: cmpldi cr1, r4, 0 +; CHECK-NEXT: bne+ cr1, .LBB0_12 +; CHECK-NEXT: b .LBB0_20 +; CHECK-NEXT: .p2align 5 +; CHECK-NEXT: .LBB0_11: # %if.end.i.i.us +; CHECK-NEXT: # +; CHECK-NEXT: lhz r5, 0(r3) +; CHECK-NEXT: cmplwi cr1, r5, 23901 +; CHECK-NEXT: beq cr1, .LBB0_19 +; CHECK-NEXT: .LBB0_12: # %_ZNK4llvm9StringRefixEm.exit.us +; CHECK-NEXT: # +; CHECK-NEXT: lbz r5, 0(r3) +; CHECK-NEXT: cmplwi cr1, r5, 92 +; CHECK-NEXT: bne cr1, .LBB0_17 +; CHECK-NEXT: # %bb.13: # %if.then4.us +; CHECK-NEXT: # +; CHECK-NEXT: mr r5, r4 +; CHECK-NEXT: bc 12, lt, .LBB0_8 +; CHECK-NEXT: # %bb.14: # %if.then4.us +; CHECK-NEXT: # +; CHECK-NEXT: li r5, 2 +; CHECK-NEXT: b .LBB0_8 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_15: # %_ZNK4llvm9StringRefixEm.exit +; CHECK-NEXT: # +; CHECK-NEXT: cmplwi r6, 93 +; CHECK-NEXT: bne cr0, .LBB0_1 +; CHECK-NEXT: # %bb.16: # %if.end10 +; CHECK-NEXT: # +; CHECK-NEXT: addi r5, r5, -1 +; CHECK-NEXT: b .LBB0_1 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_17: # %_ZNK4llvm9StringRefixEm.exit.us +; CHECK-NEXT: # +; CHECK-NEXT: cmplwi r5, 93 +; CHECK-NEXT: beq cr0, .LBB0_21 +; CHECK-NEXT: # %bb.18: # %_ZNK4llvm9StringRef6substrEmm.exit.loopexit +; CHECK-NEXT: # +; CHECK-NEXT: li r5, 0 +; CHECK-NEXT: b .LBB0_1 +; CHECK-NEXT: .LBB0_19: # %if.then +; CHECK-NEXT: addi r1, r1, 32 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB0_20: # %cond.false.i +; CHECK-NEXT: addis r3, r2, .L__ModuleStringPool@toc@ha +; CHECK-NEXT: li r5, 225 +; CHECK-NEXT: addi r4, r3, .L__ModuleStringPool@toc@l +; CHECK-NEXT: addi r3, r4, 53 +; CHECK-NEXT: addi r6, r4, 88 +; CHECK-NEXT: bl __assert_fail +; CHECK-NEXT: nop +; CHECK-NEXT: .LBB0_21: # %if.then9 +; CHECK-NEXT: li r3, 1 +; CHECK-NEXT: bl exit +; CHECK-NEXT: nop entry: %Str.coerce.fca.0.extract = extractvalue [2 x i64] %Str.coerce, 0 %Str.coerce.fca.1.extract = extractvalue [2 x i64] %Str.coerce, 1 @@ -130,16 +344,7 @@ _ZNK4llvm9StringRef6substrEmm.exit: %8 = ptrtoint ptr %add.ptr.i to i64 br label %while.cond.outer -; CHECK-LABEL: @_Z3fn1N4llvm9StringRefE ; Unecessary ISEL (all the registers are the same) is always removed -; CHECK-GEN-ISEL-TRUE-NOT: iseleq [[SAME:r[0-9]+]], [[SAME]], [[SAME]] -; CHECK-GEN-ISEL-TRUE: iseleq [[SAME:r[0-9]+]], {{r[0-9]+}}, [[SAME]] -; CHECK: bc 12, eq, [[TRUE:.LBB[0-9]+]] -; CHECK-NEXT: b [[SUCCESSOR:.LBB[0-9]+]] -; CHECK-NEXT: [[TRUE]] -; CHECK-NEXT: # in Loop: Header -; CHECK-NEXT: addi {{r[0-9]+}}, {{r[0-9]+}}, 0 -; CHECK-NEXT: [[SUCCESSOR]] } diff --git a/llvm/test/CodeGen/PowerPC/expand-foldable-isel.ll b/llvm/test/CodeGen/PowerPC/expand-foldable-isel.ll index 
8da7519fa6dc7..5425996032e38 100644 --- a/llvm/test/CodeGen/PowerPC/expand-foldable-isel.ll +++ b/llvm/test/CodeGen/PowerPC/expand-foldable-isel.ll @@ -15,8 +15,8 @@ target triple = "powerpc64le-unknown-linux-gnu" ; After that we have: ; updated: 416B %vreg18 = ISEL8 %vreg5, %vreg5, %vreg15; -; RUN: llc -verify-machineinstrs -O2 -ppc-asm-full-reg-names -mcpu=pwr7 -ppc-gen-isel=true < %s | FileCheck %s --check-prefix=CHECK-GEN-ISEL-TRUE -; RUN: llc -verify-machineinstrs -O2 -ppc-asm-full-reg-names -mcpu=pwr7 -ppc-gen-isel=false < %s | FileCheck %s --implicit-check-not isel +; RUN: llc -verify-machineinstrs -O2 -ppc-asm-full-reg-names -mcpu=pwr7 -mattr=+isel < %s | FileCheck %s --check-prefix=CHECK-GEN-ISEL-TRUE +; RUN: llc -verify-machineinstrs -O2 -ppc-asm-full-reg-names -mcpu=pwr7 -mattr=-isel < %s | FileCheck %s --implicit-check-not isel %"struct.pov::ot_block_struct" = type { ptr, [3 x double], [3 x double], float, float, float, float, float, float, float, float, float, [3 x float], float, float, [3 x double], i16 } %"struct.pov::ot_node_struct" = type { %"struct.pov::ot_id_struct", ptr, [8 x ptr] } %"struct.pov::ot_id_struct" = type { i32, i32, i32, i32 } diff --git a/llvm/test/CodeGen/PowerPC/expand-isel-1.mir b/llvm/test/CodeGen/PowerPC/expand-isel-1.mir deleted file mode 100644 index 35e5398070528..0000000000000 --- a/llvm/test/CodeGen/PowerPC/expand-isel-1.mir +++ /dev/null @@ -1,57 +0,0 @@ -# This file tests the scenario: ISEL R0, ZERO, R0, CR -# RUN: llc -ppc-gen-isel=false -run-pass ppc-expand-isel -o - %s | FileCheck %s - ---- | - target datalayout = "E-m:e-i64:64-n32:64" - target triple = "powerpc64-unknown-linux-gnu" - define signext i32 @testExpandISEL(i32 signext %i, i32 signext %j) { - entry: - %cmp = icmp sgt i32 %i, 0 - %add = add nsw i32 %i, 1 - %cond = select i1 %cmp, i32 %add, i32 %j - ret i32 %cond - } - -... ---- -name: testExpandISEL -alignment: 4 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true -liveins: - - { reg: '$x0' } - - { reg: '$x3' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false -body: | - bb.0.entry: - liveins: $x0, $x3 - - $r5 = ADDI $r3, 1 - $cr0 = CMPWI $r3, 0 - $r0 = ISEL $zero, $r0, $cr0gt - ; CHECK-LABEL: testExpandISEL - ; CHECK: BC $cr0gt, %[[TRUE:bb.[0-9]+]] - ; CHECK-NEXT: B %[[SUCCESSOR:bb.[0-9]+]] - ; CHECK: [[TRUE]] - ; CHECK: $r0 = ADDI $zero, 0 - - $x3 = EXTSW_32_64 $r0 - -... - diff --git a/llvm/test/CodeGen/PowerPC/expand-isel-10.mir b/llvm/test/CodeGen/PowerPC/expand-isel-10.mir deleted file mode 100644 index 6d51246336c22..0000000000000 --- a/llvm/test/CodeGen/PowerPC/expand-isel-10.mir +++ /dev/null @@ -1,54 +0,0 @@ -# This file tests the scenario: ISEL RX, RX, RX, CR (X != 0), -# which is redudant and removed. -# RUN: llc -ppc-gen-isel=true -run-pass ppc-expand-isel -o - %s | FileCheck %s - ---- | - target datalayout = "E-m:e-i64:64-n32:64" - target triple = "powerpc64-unknown-linux-gnu" - define signext i32 @testExpandISEL(i32 signext %i, i32 signext %j) { - entry: - %cmp = icmp sgt i32 %i, 0 - %add = add nsw i32 %i, 1 - %cond = select i1 %cmp, i32 %add, i32 %j - ret i32 %cond - } - -... 
---- -name: testExpandISEL -alignment: 4 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true -liveins: - - { reg: '$x3' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false -body: | - bb.0.entry: - liveins: $x3 - - $r5 = ADDI $r3, 1 - $cr0 = CMPWI $r3, 0 - $r3 = ISEL $r3, $r3, $cr0gt - $x3 = EXTSW_32_64 $r3 - ; CHECK: $r5 = ADDI $r3, 1 - ; CHECK: $cr0 = CMPWI $r3, 0 - ; CHECK-NOT: $r3 = ISEL $r3, $r3, $cr0gt - ; CHECK: $x3 = EXTSW_32_64 $r3 - -... diff --git a/llvm/test/CodeGen/PowerPC/expand-isel-2.mir b/llvm/test/CodeGen/PowerPC/expand-isel-2.mir deleted file mode 100644 index a4265e07f81eb..0000000000000 --- a/llvm/test/CodeGen/PowerPC/expand-isel-2.mir +++ /dev/null @@ -1,57 +0,0 @@ -# This file tests the scenario: ISEL RX, ZERO, RY, CR (X != 0 && Y != 0) -# RUN: llc -ppc-gen-isel=false -run-pass ppc-expand-isel -o - %s | FileCheck %s - ---- | - target datalayout = "E-m:e-i64:64-n32:64" - target triple = "powerpc64-unknown-linux-gnu" - define signext i32 @testExpandISEL(i32 signext %i, i32 signext %j) { - entry: - %cmp = icmp sgt i32 %i, 0 - %add = add nsw i32 %i, 1 - %cond = select i1 %cmp, i32 %add, i32 %j - ret i32 %cond - } - -... ---- -name: testExpandISEL -alignment: 4 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true -liveins: - - { reg: '$x0' } - - { reg: '$x3' } - - { reg: '$x4' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false -body: | - bb.0.entry: - liveins: $x0, $x3, $x4 - - $r5 = ADDI $r3, 1 - $cr0 = CMPWI $r3, 0 - $r3 = ISEL $zero, $r4, $cr0gt - ; CHECK: BC $cr0gt, %[[TRUE:bb.[0-9]+]] - ; CHECK: %[[FALSE:bb.[0-9]+]] - ; CHECK: $r3 = ORI $r4, 0 - ; CHECK: B %[[SUCCESSOR:bb.[0-9]+]] - ; CHECK: [[TRUE]] - ; CHECK: $r3 = ADDI $zero, 0 - - $x3 = EXTSW_32_64 $r3 -... diff --git a/llvm/test/CodeGen/PowerPC/expand-isel-3.mir b/llvm/test/CodeGen/PowerPC/expand-isel-3.mir deleted file mode 100644 index 28273602f91e6..0000000000000 --- a/llvm/test/CodeGen/PowerPC/expand-isel-3.mir +++ /dev/null @@ -1,58 +0,0 @@ -# This file tests the scenario: ISEL RX, RY, R0, CR (X != 0 && Y != 0) -# RUN: llc -ppc-gen-isel=false -run-pass ppc-expand-isel -o - %s | FileCheck %s - ---- | - target datalayout = "E-m:e-i64:64-n32:64" - target triple = "powerpc64-unknown-linux-gnu" - define signext i32 @testExpandISEL(i32 signext %i, i32 signext %j) { - entry: - %cmp = icmp sgt i32 %i, 0 - %add = add nsw i32 %i, 1 - %cond = select i1 %cmp, i32 %add, i32 %j - ret i32 %cond - } - -... 
---- -name: testExpandISEL -alignment: 4 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true -liveins: - - { reg: '$x0' } - - { reg: '$x3' } - - { reg: '$x4' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false -body: | - bb.0.entry: - liveins: $x0, $x3, $x4 - - $r5 = ADDI $r3, 1 - $cr0 = CMPWI $r3, 0 - $r3 = ISEL $r4, $r0, $cr0gt - ; CHECK: BC $cr0gt, %[[TRUE:bb.[0-9]+]] - ; CHECK: %[[FALSE:bb.[0-9]+]] - ; CHECK: $r3 = ORI $r0, 0 - ; CHECK: B %[[SUCCESSOR:bb.[0-9]+]] - ; CHECK: [[TRUE]] - ; CHECK: $r3 = ADDI $r4, 0 - - $x3 = EXTSW_32_64 $r3 - -... diff --git a/llvm/test/CodeGen/PowerPC/expand-isel-4.mir b/llvm/test/CodeGen/PowerPC/expand-isel-4.mir deleted file mode 100644 index d4484f6d527c0..0000000000000 --- a/llvm/test/CodeGen/PowerPC/expand-isel-4.mir +++ /dev/null @@ -1,59 +0,0 @@ -# This file tests the scenario: ISEL R0, ZERO, RX, CR (X != 0) -# It also tests redundant liveins ($x7) and killed registers. -# RUN: llc -ppc-gen-isel=false -run-pass ppc-expand-isel -o - %s | FileCheck %s - ---- | - target datalayout = "E-m:e-i64:64-n32:64" - target triple = "powerpc64-unknown-linux-gnu" - define signext i32 @testExpandISEL(i32 signext %i, i32 signext %j) { - entry: - %cmp = icmp sgt i32 %i, 0 - %add = add nsw i32 %i, 1 - %cond = select i1 %cmp, i32 %add, i32 %j - ret i32 %cond - } - -... ---- -name: testExpandISEL -alignment: 4 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true -liveins: - - { reg: '$x0' } - - { reg: '$x3' } - - { reg: '$x7' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false -body: | - bb.0.entry: - liveins: $x0, $x3, $x7 - - $r5 = ADDI $r3, 1 - $cr0 = CMPWI $r3, 0 - $r0 = ISEL killed $zero, killed $r5, killed $cr0gt, implicit killed $cr0 - ; CHECK: BC killed $cr0gt, %[[TRUE:bb.[0-9]+]] - ; CHECK: %[[FALSE:bb.[0-9]+]] - ; CHECK: $r0 = ORI killed $r5, 0 - ; CHECK: B %[[SUCCESSOR:bb.[0-9]+]] - ; CHECK: [[TRUE]] - ; CHECK: $r0 = ADDI killed $zero, 0 - - $x0 = EXTSW_32_64 killed $r0 - -... diff --git a/llvm/test/CodeGen/PowerPC/expand-isel-5.mir b/llvm/test/CodeGen/PowerPC/expand-isel-5.mir deleted file mode 100644 index 4142ef0fe89e4..0000000000000 --- a/llvm/test/CodeGen/PowerPC/expand-isel-5.mir +++ /dev/null @@ -1,54 +0,0 @@ -# This file tests the scenario: ISEL R0, RX, R0, CR (X != 0) -# RUN: llc -ppc-gen-isel=false -run-pass ppc-expand-isel -o - %s | FileCheck %s - ---- | - target datalayout = "E-m:e-i64:64-n32:64" - target triple = "powerpc64-unknown-linux-gnu" - define signext i32 @testExpandISEL(i32 signext %i, i32 signext %j) { - entry: - %cmp = icmp sgt i32 %i, 0 - %add = add nsw i32 %i, 1 - %cond = select i1 %cmp, i32 %add, i32 %j - ret i32 %cond - } - -... 
---- -name: testExpandISEL -alignment: 4 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true -liveins: - - { reg: '$x0' } - - { reg: '$x3' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false -body: | - bb.0.entry: - liveins: $x0, $x3 - - $r5 = ADDI $r3, 1 - $cr0 = CMPWI $r3, 0 - $r0 = ISEL $r5, $r0, $cr0gt - ; CHECK: BC $cr0gt, %[[TRUE:bb.[0-9]+]] - ; CHECK: B %[[SUCCESSOR:bb.[0-9]+]] - ; CHECK: [[TRUE]] - ; CHECK: $r0 = ADDI $r5, 0 - $x3 = EXTSW_32_64 $r0 - -... diff --git a/llvm/test/CodeGen/PowerPC/expand-isel-6.mir b/llvm/test/CodeGen/PowerPC/expand-isel-6.mir deleted file mode 100644 index 9ab511e695931..0000000000000 --- a/llvm/test/CodeGen/PowerPC/expand-isel-6.mir +++ /dev/null @@ -1,57 +0,0 @@ -# This file tests the scenario when ISEL is the last instruction of the last -# Basic Block, i.e., the BB cannot fall through to its successor situation. -# RUN: llc -ppc-gen-isel=false -run-pass ppc-expand-isel -o - %s | FileCheck %s - ---- | - target datalayout = "E-m:e-i64:64-n32:64" - target triple = "powerpc64-unknown-linux-gnu" - define signext i32 @testExpandISEL(i32 signext %i, i32 signext %j) { - entry: - %cmp = icmp sgt i32 %i, 0 - %add = add nsw i32 %i, 1 - %cond = select i1 %cmp, i32 %add, i32 %j - ret i32 %cond - } - -... ---- -name: testExpandISEL -alignment: 4 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true -liveins: - - { reg: '$x0' } - - { reg: '$x3' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false -body: | - bb.0.entry: - liveins: $x0, $x3 - - $r5 = ADDI $r3, 1 - $cr0 = CMPWI $r3, 0 - $r3 = ISEL $zero, $r0, $cr0gt - ; CHECK: BC $cr0gt, %[[TRUE:bb.[0-9]+]] - ; CHECK: %[[FALSE:bb.[0-9]+]] - ; CHECK: $r3 = ORI $r0, 0 - ; CHECK: B %[[SUCCESSOR:bb.[0-9]+]] - ; CHECK: [[TRUE]] - ; CHECK: $r3 = ADDI $zero, 0 - - -... diff --git a/llvm/test/CodeGen/PowerPC/expand-isel-7.mir b/llvm/test/CodeGen/PowerPC/expand-isel-7.mir deleted file mode 100644 index 64c2624700005..0000000000000 --- a/llvm/test/CodeGen/PowerPC/expand-isel-7.mir +++ /dev/null @@ -1,58 +0,0 @@ -# This file tests the scenario: ISEL RX, RY, RZ, CR (X != 0 && Y != 0, Z != 0) -# RUN: llc -ppc-gen-isel=false -run-pass ppc-expand-isel -o - %s | FileCheck %s - ---- | - target datalayout = "E-m:e-i64:64-n32:64" - target triple = "powerpc64-unknown-linux-gnu" - define signext i32 @testExpandISEL(i32 signext %i, i32 signext %j) { - entry: - %cmp = icmp sgt i32 %i, 0 - %add = add nsw i32 %i, 1 - %cond = select i1 %cmp, i32 %add, i32 %j - ret i32 %cond - } - -... 
---- -name: testExpandISEL -alignment: 4 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true -liveins: - - { reg: '$x3' } - - { reg: '$x4' } - - { reg: '$x5' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false -body: | - bb.0.entry: - liveins: $x3, $x4, $x5 - - $r4 = ADDI $r3, 1 - $cr0 = CMPWI $r3, 0 - $r5 = ISEL $r3, $r4, $cr0gt - ; CHECK: BC $cr0gt, %[[TRUE:bb.[0-9]+]] - ; CHECK: %[[FALSE:bb.[0-9]+]] - ; CHECK: $r5 = ORI $r4, 0 - ; CHECK: B %[[SUCCESSOR:bb.[0-9]+]] - ; CHECK: [[TRUE]] - ; CHECK: $r5 = ADDI $r3, 0 - - $x5 = EXTSW_32_64 $r5 - -... diff --git a/llvm/test/CodeGen/PowerPC/expand-isel-8.mir b/llvm/test/CodeGen/PowerPC/expand-isel-8.mir deleted file mode 100644 index 1799676afee71..0000000000000 --- a/llvm/test/CodeGen/PowerPC/expand-isel-8.mir +++ /dev/null @@ -1,65 +0,0 @@ -# This file tests combining three consecutive ISELs scenario. -# RUN: llc -ppc-gen-isel=false -run-pass ppc-expand-isel -o - %s | FileCheck %s - ---- | - target datalayout = "E-m:e-i64:64-n32:64" - target triple = "powerpc64-unknown-linux-gnu" - define signext i32 @testExpandISEL(i32 signext %i, i32 signext %j) { - entry: - %cmp = icmp sgt i32 %i, 0 - %add = add nsw i32 %i, 1 - %cond = select i1 %cmp, i32 %add, i32 %j - ret i32 %cond - } - -... ---- -name: testExpandISEL -alignment: 4 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true -liveins: - - { reg: '$x3' } - - { reg: '$x4' } - - { reg: '$x5' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false -body: | - bb.0.entry: - liveins: $x3, $x4, $x5 - - $r4 = ADDI $r3, 1 - $cr0 = CMPWI $r3, 0 - $r5 = ISEL $r3, $r4, $cr0gt - $r3 = ISEL $r4, $r5, $cr0gt - $r4 = ISEL $r3, $r5, $cr0gt - ; CHECK: BC $cr0gt, %[[TRUE:bb.[0-9]+]] - ; CHECK: %[[FALSE:bb.[0-9]+]] - ; CHECK: $r5 = ORI $r4, 0 - ; CHECK: $r3 = ORI $r5, 0 - ; CHECK: $r4 = ORI $r5, 0 - ; CHECK: B %[[SUCCESSOR:bb.[0-9]+]] - ; CHECK: [[TRUE]] - ; CHECK: $r5 = ADDI $r3, 0 - ; CHECK: $r3 = ADDI $r4, 0 - ; CHECK: $r4 = ADDI $r3, 0 - - $x5 = EXTSW_32_64 $r5 - $x3 = EXTSW_32_64 $r3 - -... diff --git a/llvm/test/CodeGen/PowerPC/expand-isel-9.mir b/llvm/test/CodeGen/PowerPC/expand-isel-9.mir deleted file mode 100644 index 2f0cdca8496b0..0000000000000 --- a/llvm/test/CodeGen/PowerPC/expand-isel-9.mir +++ /dev/null @@ -1,54 +0,0 @@ -# This file tests the scenario: ISEL RX, RY, RY, CR (X != 0 && Y != 0) -# It is folded into a copy (%RX = OR %RY, %RY) -# RUN: llc -ppc-gen-isel=true -run-pass ppc-expand-isel -o - %s | FileCheck %s - ---- | - target datalayout = "E-m:e-i64:64-n32:64" - target triple = "powerpc64-unknown-linux-gnu" - define signext i32 @testExpandISEL(i32 signext %i, i32 signext %j) { - entry: - %cmp = icmp sgt i32 %i, 0 - %add = add nsw i32 %i, 1 - %cond = select i1 %cmp, i32 %add, i32 %j - ret i32 %cond - } - -... 
---- -name: testExpandISEL -alignment: 4 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true -liveins: - - { reg: '$x3' } - - { reg: '$x4' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false -body: | - bb.0.entry: - liveins: $x3, $x4 - - $r5 = ADDI $r3, 1 - $cr0 = CMPWI $r3, 0 - $r3 = ISEL $r4, $r4, $cr0gt - ; Test fold ISEL to a copy - ; CHECK: $r3 = OR $r4, $r4 - - $x3 = EXTSW_32_64 $r3 - -... diff --git a/llvm/test/CodeGen/PowerPC/expand-isel-liveness.mir b/llvm/test/CodeGen/PowerPC/expand-isel-liveness.mir deleted file mode 100644 index 262e71d48fc09..0000000000000 --- a/llvm/test/CodeGen/PowerPC/expand-isel-liveness.mir +++ /dev/null @@ -1,80 +0,0 @@ -# RUN: llc -mtriple powerpc64-unknown-linux-gnu -run-pass=ppc-expand-isel -o \ -# RUN: - %s -verify-machineinstrs | FileCheck %s - ---- -name: expand_isel_liveness1 -tracksRegLiveness: true -registers: [] -liveins: - - { reg: '$x3', virtual-reg: '' } - - { reg: '$x4', virtual-reg: '' } - - { reg: '$x5', virtual-reg: '' } - - { reg: '$x6', virtual-reg: '' } -body: | - bb.0: - liveins: $x3, $x4, $x5, $x6 - - renamable $x8 = MULLD renamable $x5, renamable $x4 - renamable $cr5 = CMPDI renamable $x3, 0 - dead renamable $x9 = MULHDU_rec renamable $x3, renamable $x6, implicit-def $cr0 - renamable $x3 = MULLD killed renamable $x3, renamable $x6 - $cr1 = MCRF killed $cr0 - renamable $x3 = ADD8 killed renamable $x3, killed renamable $x8 - renamable $cr0 = CMPDI renamable $x5, 0 - renamable $cr5lt = CRNOR killed renamable $cr0eq, killed renamable $cr5eq, implicit $cr5, implicit $cr0 - renamable $cr0 = CMPLDI renamable $x3, 0 - renamable $x8 = MULHDU renamable $x4, renamable $x6 - renamable $x3 = ADD8 renamable $x8, killed renamable $x3 - renamable $cr6 = CMPLD renamable $x3, killed renamable $x8 - renamable $cr5gt = CRANDC killed renamable $cr6lt, killed renamable $cr0eq, implicit $cr0, implicit $cr6 - renamable $cr5lt = CRORC killed renamable $cr5lt, killed renamable $cr1eq, implicit $cr1 - renamable $x7 = LI8 1 - dead renamable $x5 = MULHDU_rec killed renamable $x5, renamable $x4, implicit-def $cr0 - renamable $cr5lt = CRORC killed renamable $cr5lt, killed renamable $cr0eq, implicit $cr0 - renamable $cr5lt = CRNOR killed renamable $cr5lt, killed renamable $cr5gt - renamable $x4 = MULLD killed renamable $x4, killed renamable $x6 - renamable $x5 = ISEL8 $zero8, killed renamable $x7, killed renamable $cr5lt - BLR8 implicit $lr8, implicit $rm, implicit killed $x3, implicit killed $x4, implicit killed $x5 - - ; CHECK-LABEL: name: expand_isel_liveness1 - ; CHECK: bb.1: - ; CHECK: liveins: $x3, $x4, $x7 - ; CHECK: renamable $x5 = ORI8 killed renamable $x7, 0 - ; CHECK: B %bb.3 - ; CHECK: bb.2: - ; CHECK: liveins: $x3, $x4 - ; CHECK: renamable $x5 = ADDI8 $zero8, 0 - ; CHECK: bb.3: - ; CHECK: liveins: $x3, $x4, $x5 - ; CHECK: BLR8 implicit $lr8, implicit $rm, implicit killed $x3, implicit killed $x4, implicit killed $x5 -... 
- ---- -name: expand_isel_liveness2 -tracksRegLiveness: true -liveins: - - { reg: '$r0' } - - { reg: '$r3' } -body: | - bb.0.entry: - liveins: $r0, $r3 - - $r5 = ADDI $r3, 1 - $cr0 = CMPWI $r3, 0 - $r3 = ISEL $zero, killed $r0, killed $cr0gt - - ; CHECK-LABEL: name: expand_isel_liveness2 - ; CHECK: bb.0.entry: - ; CHECK: liveins: $r0, $r3 - ; CHECK: $r5 = ADDI $r3, 1 - ; CHECK: $cr0 = CMPWI $r3, 0 - ; CHECK: BC killed $cr0gt, %bb.2 - ; CHECK: bb.1.entry: - ; CHECK: liveins: $r0 - ; CHECK: $r3 = ORI killed $r0, 0 - ; CHECK: B %bb.3 - ; CHECK: bb.2.entry: - ; CHECK-NOT: liveins: $zero - ; CHECK: $r3 = ADDI $zero, 0 -... - diff --git a/llvm/test/CodeGen/PowerPC/expand-isel.ll b/llvm/test/CodeGen/PowerPC/expand-isel.ll index cf403d6db14da..16e18b595da14 100644 --- a/llvm/test/CodeGen/PowerPC/expand-isel.ll +++ b/llvm/test/CodeGen/PowerPC/expand-isel.ll @@ -1,150 +1,161 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" target triple = "powerpc64-unknown-linux-gnu" -; RUN: llc -ppc-gpr-icmps=all -verify-machineinstrs -O2 -ppc-asm-full-reg-names -mcpu=pwr7 -ppc-gen-isel=false < %s | FileCheck %s --implicit-check-not isel +; RUN: llc -ppc-gpr-icmps=all -verify-machineinstrs -O2 -ppc-asm-full-reg-names -mcpu=pwr7 -mattr=-isel < %s | FileCheck %s --implicit-check-not isel define signext i32 @testExpandISELToIfElse(i32 signext %i, i32 signext %j) { +; CHECK-LABEL: testExpandISELToIfElse: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpwi r3, 0 +; CHECK-NEXT: ble cr0, .LBB0_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: addi r4, r3, 1 +; CHECK-NEXT: .LBB0_2: # %entry +; CHECK-NEXT: extsw r3, r4 +; CHECK-NEXT: blr entry: %cmp = icmp sgt i32 %i, 0 %add = add nsw i32 %i, 1 %cond = select i1 %cmp, i32 %add, i32 %j ret i32 %cond -; CHECK-LABEL: @testExpandISELToIfElse -; CHECK: addi r5, r3, 1 -; CHECK-NEXT: cmpwi r3, 0 -; CHECK-NEXT: bc 12, gt, [[TRUE:.LBB[0-9]+]] -; CHECK: ori r3, r4, 0 -; CHECK-NEXT: b [[SUCCESSOR:.LBB[0-9]+]] -; CHECK-NEXT: [[TRUE]] -; CHECK-NEXT: addi r3, r5, 0 -; CHECK-NEXT: [[SUCCESSOR]] -; CHECK-NEXT: extsw r3, r3 -; CHECK-NEXT: blr } - define signext i32 @testExpandISELToIf(i32 signext %i, i32 signext %j) { +; CHECK-LABEL: testExpandISELToIf: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpwi r3, 0 +; CHECK-NEXT: bgt cr0, .LBB1_2 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: mr r4, r3 +; CHECK-NEXT: .LBB1_2: # %entry +; CHECK-NEXT: mr r3, r4 +; CHECK-NEXT: blr entry: %cmp = icmp sgt i32 %i, 0 %cond = select i1 %cmp, i32 %j, i32 %i ret i32 %cond -; CHECK-LABEL: @testExpandISELToIf -; CHECK: cmpwi r3, 0 -; CHECK-NEXT: bc 12, gt, [[TRUE:.LBB[0-9]+]] -; CHECK-NEXT: blr -; CHECK-NEXT: [[TRUE]] -; CHECK-NEXT: addi r3, r4, 0 -; CHECK-NEXT: blr } define signext i32 @testExpandISELToElse(i32 signext %i, i32 signext %j) { +; CHECK-LABEL: testExpandISELToElse: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpwi r3, 0 +; CHECK-NEXT: bgtlr cr0 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: mr r3, r4 +; CHECK-NEXT: blr entry: %cmp = icmp sgt i32 %i, 0 %cond = select i1 %cmp, i32 %i, i32 %j ret i32 %cond -; CHECK-LABEL: @testExpandISELToElse -; CHECK: cmpwi r3, 0 -; CHECK-NEXT: bclr 12, gt, 0 -; CHECK: ori r3, r4, 0 -; CHECK-NEXT: blr } - define signext i32 @testExpandISELToNull(i32 signext %i, i32 signext %j) { +; CHECK-LABEL: testExpandISELToNull: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: blr entry: %cmp = icmp sgt i32 %i, 0 %cond = 
select i1 %cmp, i32 %i, i32 %i ret i32 %cond -; CHECK-LABEL: @testExpandISELToNull -; CHECK-NOT: b {{.LBB[0-9]+}} -; CHECK-NOT: bc -; CHECK: blr } -define signext i32 @testExpandISELsTo2ORIs2ADDIs - (i32 signext %a, i32 signext %b, i32 signext %d, - i32 signext %f, i32 signext %g) { +define signext i32 @testExpandISELsTo2ORIs2ADDIs(i32 signext %a, i32 signext %b, i32 signext %d, i32 signext %f, i32 signext %g) { +; CHECK-LABEL: testExpandISELsTo2ORIs2ADDIs: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpwi r7, 0 +; CHECK-NEXT: bgt cr0, .LBB4_2 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: mr r7, r4 +; CHECK-NEXT: .LBB4_2: # %entry +; CHECK-NEXT: bgt cr0, .LBB4_4 +; CHECK-NEXT: # %bb.3: # %entry +; CHECK-NEXT: mr r5, r6 +; CHECK-NEXT: .LBB4_4: # %entry +; CHECK-NEXT: add r3, r7, r5 +; CHECK-NEXT: extsw r3, r3 +; CHECK-NEXT: blr entry: - %cmp = icmp sgt i32 %g, 0 %a.b = select i1 %cmp, i32 %g, i32 %b %d.f = select i1 %cmp, i32 %d, i32 %f %add = add nsw i32 %a.b, %d.f ret i32 %add - -; CHECK-LABEL: @testExpandISELsTo2ORIs2ADDIs -; CHECK: cmpwi r7, 0 -; CHECK-NEXT: bc 12, gt, [[TRUE:.LBB[0-9]+]] -; CHECK: ori r3, r4, 0 -; CHECK-NEXT: ori r4, r6, 0 -; CHECK-NEXT: b [[SUCCESSOR:.LBB[0-9]+]] -; CHECK-NEXT: [[TRUE]] -; CHECK-NEXT: addi r3, r7, 0 -; CHECK-NEXT: addi r4, r5, 0 -; CHECK-NEXT: [[SUCCESSOR]] -; CHECK-NEXT: add r3, r3, r4 -; CHECK-NEXT: extsw r3, r3 -; CHECK-NEXT: blr } -define signext i32 @testExpandISELsTo2ORIs1ADDI - (i32 signext %a, i32 signext %b, i32 signext %d, - i32 signext %f, i32 signext %g) { +define signext i32 @testExpandISELsTo2ORIs1ADDI(i32 signext %a, i32 signext %b, i32 signext %d, i32 signext %f, i32 signext %g) { +; CHECK-LABEL: testExpandISELsTo2ORIs1ADDI: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpwi r7, 0 +; CHECK-NEXT: bgt cr0, .LBB5_2 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: mr r3, r4 +; CHECK-NEXT: .LBB5_2: # %entry +; CHECK-NEXT: bgt cr0, .LBB5_4 +; CHECK-NEXT: # %bb.3: # %entry +; CHECK-NEXT: mr r5, r6 +; CHECK-NEXT: .LBB5_4: # %entry +; CHECK-NEXT: add r3, r3, r5 +; CHECK-NEXT: extsw r3, r3 +; CHECK-NEXT: blr entry: %cmp = icmp sgt i32 %g, 0 %a.b = select i1 %cmp, i32 %a, i32 %b %d.f = select i1 %cmp, i32 %d, i32 %f %add = add nsw i32 %a.b, %d.f ret i32 %add - -; CHECK-LABEL: @testExpandISELsTo2ORIs1ADDI -; CHECK: cmpwi r7, 0 -; CHECK-NEXT: bc 12, gt, [[TRUE:.LBB[0-9]+]] -; CHECK: ori r3, r4, 0 -; CHECK-NEXT: ori r4, r6, 0 -; CHECK-NEXT: b [[SUCCESSOR:.LBB[0-9]+]] -; CHECK-NEXT: [[TRUE]] -; CHECK-NEXT: addi r4, r5, 0 -; CHECK-NEXT: [[SUCCESSOR]] -; CHECK-NEXT: add r3, r3, r4 -; CHECK-NEXT: extsw r3, r3 -; CHECK-NEXT: blr } -define signext i32 @testExpandISELsTo1ORI1ADDI - (i32 signext %a, i32 signext %b, i32 signext %d, - i32 signext %f, i32 signext %g) { +define signext i32 @testExpandISELsTo1ORI1ADDI(i32 signext %a, i32 signext %b, i32 signext %d, i32 signext %f, i32 signext %g) { +; CHECK-LABEL: testExpandISELsTo1ORI1ADDI: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpwi r7, 0 +; CHECK-NEXT: mr r7, r3 +; CHECK-NEXT: bgt cr0, .LBB6_2 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: mr r7, r4 +; CHECK-NEXT: .LBB6_2: # %entry +; CHECK-NEXT: bgt cr0, .LBB6_4 +; CHECK-NEXT: # %bb.3: # %entry +; CHECK-NEXT: mr r5, r6 +; CHECK-NEXT: .LBB6_4: # %entry +; CHECK-NEXT: add r4, r7, r5 +; CHECK-NEXT: add r3, r3, r4 +; CHECK-NEXT: extsw r3, r3 +; CHECK-NEXT: blr entry: - %cmp = icmp sgt i32 %g, 0 %a.b = select i1 %cmp, i32 %a, i32 %b %d.f = select i1 %cmp, i32 %d, i32 %f %add1 = add nsw i32 %a.b, %d.f %add2 = add nsw i32 %a, %add1 ret i32 %add2 - -; 
CHECK-LABEL: @testExpandISELsTo1ORI1ADDI -; CHECK: cmpwi r7, 0 -; CHECK-NEXT: bc 12, gt, [[TRUE:.LBB[0-9]+]] -; CHECK: ori r5, r6, 0 -; CHECK-NEXT: b [[SUCCESSOR:.LBB[0-9]+]] -; CHECK-NEXT: [[TRUE]] -; CHECK-NEXT: addi r4, r3, 0 -; CHECK-NEXT: [[SUCCESSOR]] -; CHECK-NEXT: add r4, r4, r5 -; CHECK-NEXT: add r3, r3, r4 -; CHECK-NEXT: extsw r3, r3 -; CHECK-NEXT: blr } -define signext i32 @testExpandISELsTo0ORI2ADDIs - (i32 signext %a, i32 signext %b, i32 signext %d, - i32 signext %f, i32 signext %g) { +define signext i32 @testExpandISELsTo0ORI2ADDIs(i32 signext %a, i32 signext %b, i32 signext %d, i32 signext %f, i32 signext %g) { +; CHECK-LABEL: testExpandISELsTo0ORI2ADDIs: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpwi r7, 0 +; CHECK-NEXT: mr r7, r3 +; CHECK-NEXT: bgt cr0, .LBB7_2 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: mr r7, r4 +; CHECK-NEXT: .LBB7_2: # %entry +; CHECK-NEXT: mr r4, r5 +; CHECK-NEXT: bgt cr0, .LBB7_4 +; CHECK-NEXT: # %bb.3: # %entry +; CHECK-NEXT: mr r4, r6 +; CHECK-NEXT: .LBB7_4: # %entry +; CHECK-NEXT: add r4, r7, r4 +; CHECK-NEXT: add r3, r3, r4 +; CHECK-NEXT: sub r3, r3, r5 +; CHECK-NEXT: extsw r3, r3 +; CHECK-NEXT: blr entry: - %cmp = icmp sgt i32 %g, 0 %a.b = select i1 %cmp, i32 %a, i32 %b %d.f = select i1 %cmp, i32 %d, i32 %f @@ -152,27 +163,30 @@ entry: %add2 = add nsw i32 %a, %add1 %sub1 = sub nsw i32 %add2, %d ret i32 %sub1 - -; CHECK-LABEL: @testExpandISELsTo0ORI2ADDIs -; CHECK: cmpwi r7, 0 -; CHECK-NEXT: bc 12, gt, [[TRUE:.LBB[0-9]+]] -; CHECK-NEXT: b [[SUCCESSOR:.LBB[0-9]+]] -; CHECK-NEXT: [[TRUE]] -; CHECK-NEXT: addi r4, r3, 0 -; CHECK-NEXT: addi r6, r5, 0 -; CHECK-NEXT: [[SUCCESSOR]] -; CHECK-NEXT: add r4, r4, r6 -; CHECK-NEXT: add r3, r3, r4 -; CHECK-NEXT: sub r3, r3, r5 -; CHECK-NEXT: extsw r3, r3 -; CHECK-NEXT: blr } - @b = local_unnamed_addr global i32 0, align 4 @a = local_unnamed_addr global i32 0, align 4 ; Function Attrs: norecurse nounwind readonly define signext i32 @testComplexISEL() #0 { +; CHECK-LABEL: testComplexISEL: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addis r3, r2, .LC0@toc@ha +; CHECK-NEXT: ld r3, .LC0@toc@l(r3) +; CHECK-NEXT: lwz r4, 0(r3) +; CHECK-NEXT: li r3, 1 +; CHECK-NEXT: cmplwi r4, 0 +; CHECK-NEXT: bnelr cr0 +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: addis r3, r2, .LC1@toc@ha +; CHECK-NEXT: addis r4, r2, .LC2@toc@ha +; CHECK-NEXT: ld r3, .LC1@toc@l(r3) +; CHECK-NEXT: ld r4, .LC2@toc@l(r4) +; CHECK-NEXT: lwa r3, 0(r3) +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: blr entry: %0 = load i32, ptr @b, align 4, !tbaa !1 %tobool = icmp eq i32 %0, 0 @@ -190,13 +204,6 @@ cleanup: %retval.0 = phi i32 [ %conv3, %if.end ], [ 1, %entry ] ret i32 %retval.0 -; CHECK-LABEL: @testComplexISEL -; CHECK: li r3, 1 -; CHECK: cmplwi r4, 0 -; CHECK: bnelr cr0 -; CHECK: xor [[XOR:r[0-9]+]] -; CHECK: cntlzd [[CZ:r[0-9]+]], [[XOR]] -; CHECK: rldicl [[SH:r[0-9]+]], [[CZ]], 58, 63 } !1 = !{!2, !2, i64 0} diff --git a/llvm/test/CodeGen/PowerPC/fold-zero.ll b/llvm/test/CodeGen/PowerPC/fold-zero.ll index 6262d24040a3e..a071464ac6410 100644 --- a/llvm/test/CodeGen/PowerPC/fold-zero.ll +++ b/llvm/test/CodeGen/PowerPC/fold-zero.ll @@ -1,40 +1,62 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-crbits | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck --check-prefix=CHECK-CRB %s 
-; RUN: llc -verify-machineinstrs -ppc-gen-isel=false < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck --check-prefix=CHECK-NO-ISEL %s +; RUN: llc -verify-machineinstrs -mattr=-isel < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck --check-prefix=CHECK-NO-ISEL %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" target triple = "powerpc64-unknown-linux-gnu" define i32 @test1(i1 %a, i32 %c) nounwind { +; CHECK-LABEL: test1: +; CHECK: # %bb.0: +; CHECK-NEXT: andi. 3, 3, 1 +; CHECK-NEXT: iseleq 3, 0, 4 +; CHECK-NEXT: blr +; +; CHECK-CRB-LABEL: test1: +; CHECK-CRB: # %bb.0: +; CHECK-CRB-NEXT: andi. 3, 3, 1 +; CHECK-CRB-NEXT: li 3, 0 +; CHECK-CRB-NEXT: iselgt 3, 4, 3 +; CHECK-CRB-NEXT: blr +; +; CHECK-NO-ISEL-LABEL: test1: +; CHECK-NO-ISEL: # %bb.0: +; CHECK-NO-ISEL-NEXT: andi. 3, 3, 1 +; CHECK-NO-ISEL-NEXT: bc 12, 1, .LBB0_2 +; CHECK-NO-ISEL-NEXT: # %bb.1: +; CHECK-NO-ISEL-NEXT: li 4, 0 +; CHECK-NO-ISEL-NEXT: .LBB0_2: +; CHECK-NO-ISEL-NEXT: mr 3, 4 +; CHECK-NO-ISEL-NEXT: blr %x = select i1 %a, i32 %c, i32 0 ret i32 %x -; CHECK-LABEL: @test1 -; CHECK-NOT: li {{[0-9]+}}, 0 -; CHECK: iseleq 3, 0, -; CHECK: blr -; CHECK-NO-ISEL-LABEL: @test1 -; CHECK-NO-ISEL: li 3, 0 -; CHECK-NO-ISEL-NEXT: bc 12, 1, [[TRUE:.LBB[0-9]+]] -; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL-NEXT: [[TRUE]] -; CHECK-NO-ISEL-NEXT: addi 3, 4, 0 -; CHECK-NO-ISEL-NEXT: blr } define i32 @test2(i1 %a, i32 %c) nounwind { +; CHECK-LABEL: test2: +; CHECK: # %bb.0: +; CHECK-NEXT: andi. 3, 3, 1 +; CHECK-NEXT: li 3, 0 +; CHECK-NEXT: iseleq 3, 4, 3 +; CHECK-NEXT: blr +; +; CHECK-CRB-LABEL: test2: +; CHECK-CRB: # %bb.0: +; CHECK-CRB-NEXT: andi. 3, 3, 1 +; CHECK-CRB-NEXT: iselgt 3, 0, 4 +; CHECK-CRB-NEXT: blr +; +; CHECK-NO-ISEL-LABEL: test2: +; CHECK-NO-ISEL: # %bb.0: +; CHECK-NO-ISEL-NEXT: andi. 
3, 3, 1 +; CHECK-NO-ISEL-NEXT: li 3, 0 +; CHECK-NO-ISEL-NEXT: bclr 12, 1, 0 +; CHECK-NO-ISEL-NEXT: # %bb.1: +; CHECK-NO-ISEL-NEXT: mr 3, 4 +; CHECK-NO-ISEL-NEXT: blr %x = select i1 %a, i32 0, i32 %c ret i32 %x -; CHECK-CRB-LABEL: @test2 -; CHECK-CRB-NOT: li {{[0-9]+}}, 0 -; CHECK-CRB: iselgt 3, 0, -; CHECK-CRB: blr -; CHECK-NO-ISEL-LABEL: @test2 -; CHECK-NO-ISEL: bc 12, 1, [[TRUE:.LBB[0-9]+]] -; CHECK-NO-ISEL: ori 3, 4, 0 -; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL-NEXT: [[TRUE]] -; CHECK-NO-ISEL-NEXT: li 3, 0 -; CHECK-NO-ISEL-NEXT: blr } diff --git a/llvm/test/CodeGen/PowerPC/i1-ext-fold.ll b/llvm/test/CodeGen/PowerPC/i1-ext-fold.ll index 0a666860cbd76..a1be8d39994d5 100644 --- a/llvm/test/CodeGen/PowerPC/i1-ext-fold.ll +++ b/llvm/test/CodeGen/PowerPC/i1-ext-fold.ll @@ -1,32 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s -; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-gen-isel=false < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-isel < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" ; Function Attrs: nounwind readnone define signext i32 @foo(i32 signext %a, i32 signext %b) #0 { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpw 3, 4 +; CHECK-NEXT: li 3, 0 +; CHECK-NEXT: li 4, 16 +; CHECK-NEXT: isellt 3, 4, 3 +; CHECK-NEXT: blr +; +; CHECK-NO-ISEL-LABEL: foo: +; CHECK-NO-ISEL: # %bb.0: # %entry +; CHECK-NO-ISEL-NEXT: cmpw 3, 4 +; CHECK-NO-ISEL-NEXT: li 3, 16 +; CHECK-NO-ISEL-NEXT: bclr 12, 0, 0 +; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry +; CHECK-NO-ISEL-NEXT: li 3, 0 +; CHECK-NO-ISEL-NEXT: blr entry: %cmp = icmp slt i32 %a, %b %conv = zext i1 %cmp to i32 %shl = shl nuw nsw i32 %conv, 4 ret i32 %shl -; CHECK-LABEL: @foo -; CHECK-NO-ISEL-LABEL: @foo -; CHECK-DAG: cmpw -; CHECK-DAG: li [[REG1:[0-9]+]], 0 -; CHECK-DAG: li [[REG2:[0-9]+]], 16 -; CHECK: isellt 3, [[REG2]], [[REG1]] -; CHECK: blr -; CHECK-NO-ISEL: bc 12, 0, -; CHECK-NO-ISEL: blr -; CHECK-NO-ISEL: addi 3, 4, 0 -; CHECK-NO-ISEL-NEXT: blr } ; Function Attrs: nounwind readnone define signext i32 @foo2(i32 signext %a, i32 signext %b) #0 { +; CHECK-LABEL: foo2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpw 3, 4 +; CHECK-NEXT: li 3, 5 +; CHECK-NEXT: li 4, 21 +; CHECK-NEXT: isellt 3, 4, 3 +; CHECK-NEXT: blr +; +; CHECK-NO-ISEL-LABEL: foo2: +; CHECK-NO-ISEL: # %bb.0: # %entry +; CHECK-NO-ISEL-NEXT: cmpw 3, 4 +; CHECK-NO-ISEL-NEXT: li 3, 21 +; CHECK-NO-ISEL-NEXT: bclr 12, 0, 0 +; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry +; CHECK-NO-ISEL-NEXT: li 3, 5 +; CHECK-NO-ISEL-NEXT: blr entry: %cmp = icmp slt i32 %a, %b %conv = zext i1 %cmp to i32 @@ -34,40 +56,33 @@ entry: %add1 = or i32 %shl, 5 ret i32 %add1 -; CHECK-LABEL: @foo2 -; CHECK-NO-ISEL-LABEL: @foo2 -; CHECK-DAG: cmpw -; CHECK-DAG: li [[REG1:[0-9]+]], 5 -; CHECK-DAG: li [[REG2:[0-9]+]], 21 -; CHECK: isellt 3, [[REG2]], [[REG1]] -; CHECK: blr -; CHECK-NO-ISEL: bc 12, 0, -; CHECK-NO-ISEL: blr -; CHECK-NO-ISEL: addi 3, 4, 0 -; CHECK-NO-ISEL-NEXT: blr } ; Function Attrs: nounwind readnone define signext i32 @foo3(i32 signext %a, i32 signext %b) #0 { +; CHECK-LABEL: foo3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpw 3, 4 +; CHECK-NEXT: li 3, 16 +; CHECK-NEXT: iselgt 3, 0, 3 +; CHECK-NEXT: blr +; +; CHECK-NO-ISEL-LABEL: foo3: +; CHECK-NO-ISEL: # %bb.0: # %entry +; CHECK-NO-ISEL-NEXT: cmpw 3, 4 +; 
CHECK-NO-ISEL-NEXT: li 3, 0 +; CHECK-NO-ISEL-NEXT: bclr 12, 1, 0 +; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry +; CHECK-NO-ISEL-NEXT: li 3, 16 +; CHECK-NO-ISEL-NEXT: blr entry: %cmp = icmp sle i32 %a, %b %conv = zext i1 %cmp to i32 %shl = shl nuw nsw i32 %conv, 4 ret i32 %shl -; CHECK-LABEL: @foo3 -; CHECK-NO-ISEL-LABEL: @foo3 -; CHECK-DAG: cmpw -; CHECK-DAG: li [[REG1:[0-9]+]], 16 -; CHECK: iselgt 3, 0, [[REG1]] -; CHECK: blr -; CHECK-NO-ISEL: bc 12, 1, [[TRUE:.LBB[0-9]+]] -; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL-NEXT: [[TRUE]] -; CHECK-NO-ISEL-NEXT: li 3, 0 -; CHECK-NO-ISEL-NEXT: blr } attributes #0 = { nounwind readnone } diff --git a/llvm/test/CodeGen/PowerPC/i64_fp_round.ll b/llvm/test/CodeGen/PowerPC/i64_fp_round.ll index 340d9aff8f85b..f7df003fcc3f8 100644 --- a/llvm/test/CodeGen/PowerPC/i64_fp_round.ll +++ b/llvm/test/CodeGen/PowerPC/i64_fp_round.ll @@ -1,37 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; Verify that we get the code sequence needed to avoid double-rounding. +; Note that only parts of the sequence are checked for here, to allow +; for minor code generation differences. ; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-fpcvt < %s | FileCheck %s -; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-fpcvt -ppc-gen-isel=false < %s | FileCheck %s --check-prefix=CHECK-NO-ISEL +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-fpcvt -mattr=-isel < %s | FileCheck %s --check-prefix=CHECK-NO-ISEL +; Also check that with -enable-unsafe-fp-math we do not get that extra +; code sequence. Simply verify that there is no "isel" present. +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-fpcvt -enable-unsafe-fp-math < %s | FileCheck %s -check-prefix=CHECK-UNSAFE +; CHECK-UNSAFE-NOT: isel target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" target triple = "powerpc64-unknown-linux-gnu" define float @test(i64 %x) nounwind readnone { -entry: - %conv = sitofp i64 %x to float - ret float %conv -} - ; Verify that we get the code sequence needed to avoid double-rounding. ; Note that only parts of the sequence are checked for here, to allow ; for minor code generation differences. - -;CHECK-LABEL: test -;CHECK-NO-ISEL-LABEL: test -; CHECK: sradi [[REG1:[0-9]+]], 3, 53 -; CHECK: addi [[REG2:[0-9]+]], [[REG1]], 1 -; CHECK: cmpldi [[REG2]], 1 -; CHECK: iselgt [[REG3:[0-9]+]], {{[0-9]+}}, 3 -; CHECK-NO-ISEL: rldicr [[REG2:[0-9]+]], {{[0-9]+}}, 0, 52 -; CHECK-NO-ISEL: bc 12, 1, [[TRUE:.LBB[0-9]+]] -; CHECK-NO-ISEL: b [[SUCCESSOR:.LBB[0-9]+]] -; CHECK-NO-ISEL-NEXT: [[TRUE]] -; CHECK-NO-ISEL-NEXT: addi {{[0-9]+}}, [[REG2]], 0 -; CHECK-NO-ISEL-NEXT: [[SUCCESSOR]] -; CHECK-NO-ISEL: std {{[0-9]+}}, -{{[0-9]+}}(1) -; CHECK: std [[REG3]], -{{[0-9]+}}(1) - - ; Also check that with -enable-unsafe-fp-math we do not get that extra ; code sequence. Simply verify that there is no "isel" present. 
- ; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-fpcvt -enable-unsafe-fp-math < %s | FileCheck %s -check-prefix=CHECK-UNSAFE -; CHECK-UNSAFE-NOT: isel +; CHECK-LABEL: test: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: clrldi 4, 3, 53 +; CHECK-NEXT: sradi 5, 3, 53 +; CHECK-NEXT: addi 4, 4, 2047 +; CHECK-NEXT: addi 5, 5, 1 +; CHECK-NEXT: or 4, 4, 3 +; CHECK-NEXT: cmpldi 5, 1 +; CHECK-NEXT: rldicr 4, 4, 0, 52 +; CHECK-NEXT: iselgt 3, 4, 3 +; CHECK-NEXT: std 3, -8(1) +; CHECK-NEXT: lfd 0, -8(1) +; CHECK-NEXT: xscvsxddp 0, 0 +; CHECK-NEXT: frsp 1, 0 +; CHECK-NEXT: blr +; +; CHECK-NO-ISEL-LABEL: test: +; CHECK-NO-ISEL: # %bb.0: # %entry +; CHECK-NO-ISEL-NEXT: sradi 4, 3, 53 +; CHECK-NO-ISEL-NEXT: addi 4, 4, 1 +; CHECK-NO-ISEL-NEXT: cmpldi 4, 1 +; CHECK-NO-ISEL-NEXT: bc 4, 1, .LBB0_2 +; CHECK-NO-ISEL-NEXT: # %bb.1: +; CHECK-NO-ISEL-NEXT: clrldi 4, 3, 53 +; CHECK-NO-ISEL-NEXT: addi 4, 4, 2047 +; CHECK-NO-ISEL-NEXT: or 3, 4, 3 +; CHECK-NO-ISEL-NEXT: rldicr 3, 3, 0, 52 +; CHECK-NO-ISEL-NEXT: .LBB0_2: # %entry +; CHECK-NO-ISEL-NEXT: std 3, -8(1) +; CHECK-NO-ISEL-NEXT: lfd 0, -8(1) +; CHECK-NO-ISEL-NEXT: xscvsxddp 0, 0 +; CHECK-NO-ISEL-NEXT: frsp 1, 0 +; CHECK-NO-ISEL-NEXT: blr +; +; CHECK-UNSAFE-LABEL: test: +; CHECK-UNSAFE: # %bb.0: # %entry +; CHECK-UNSAFE-NEXT: std 3, -8(1) +; CHECK-UNSAFE-NEXT: lfd 0, -8(1) +; CHECK-UNSAFE-NEXT: xscvsxddp 0, 0 +; CHECK-UNSAFE-NEXT: frsp 1, 0 +; CHECK-UNSAFE-NEXT: blr + +entry: + %conv = sitofp i64 %x to float + ret float %conv +} + diff --git a/llvm/test/CodeGen/PowerPC/ifcvt.ll b/llvm/test/CodeGen/PowerPC/ifcvt.ll index f04deb37a5755..6b9d872f4aad7 100644 --- a/llvm/test/CodeGen/PowerPC/ifcvt.ll +++ b/llvm/test/CodeGen/PowerPC/ifcvt.ll @@ -1,9 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -verify-machineinstrs -ppc-gen-isel=false | FileCheck --check-prefix=CHECK-NO-ISEL %s +; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -verify-machineinstrs -mattr=-isel | FileCheck --check-prefix=CHECK-NO-ISEL %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" target triple = "powerpc64-unknown-linux-gnu" define i32 @test(i32 %a, i32 %b, i32 %c, i32 %d) { +; CHECK-LABEL: test: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: slwi 5, 6, 16 +; CHECK-NEXT: extsh 6, 6 +; CHECK-NEXT: cmpwi 5, -1 +; CHECK-NEXT: add 5, 6, 3 +; CHECK-NEXT: clrlwi 6, 6, 17 +; CHECK-NEXT: sub 6, 3, 6 +; CHECK-NEXT: sub 3, 4, 3 +; CHECK-NEXT: iselgt 5, 5, 6 +; CHECK-NEXT: extsh 5, 5 +; CHECK-NEXT: add 3, 3, 5 +; CHECK-NEXT: blr +; +; CHECK-NO-ISEL-LABEL: test: +; CHECK-NO-ISEL: # %bb.0: # %entry +; CHECK-NO-ISEL-NEXT: slwi 7, 6, 16 +; CHECK-NO-ISEL-NEXT: extsh 5, 6 +; CHECK-NO-ISEL-NEXT: cmpwi 7, -1 +; CHECK-NO-ISEL-NEXT: ble 0, .LBB0_2 +; CHECK-NO-ISEL-NEXT: # %bb.1: # %cond.false +; CHECK-NO-ISEL-NEXT: add 5, 5, 3 +; CHECK-NO-ISEL-NEXT: b .LBB0_3 +; CHECK-NO-ISEL-NEXT: .LBB0_2: # %cond.true +; CHECK-NO-ISEL-NEXT: clrlwi 5, 5, 17 +; CHECK-NO-ISEL-NEXT: sub 5, 3, 5 +; CHECK-NO-ISEL-NEXT: .LBB0_3: # %cond.end +; CHECK-NO-ISEL-NEXT: extsh 5, 5 +; CHECK-NO-ISEL-NEXT: sub 3, 4, 3 +; CHECK-NO-ISEL-NEXT: add 3, 3, 5 +; CHECK-NO-ISEL-NEXT: blr entry: %sext82 = shl i32 %d, 16 %conv29 = ashr exact i32 %sext82, 16 @@ -19,18 +51,6 @@ cond.false: ; preds = %sw.epilog %add37 = add nsw i32 
%conv29, %a br label %cond.end -; CHECK-LABEL: @test -; CHECK-NO-ISEL-LABEL: @test -; CHECK: add [[REG:[0-9]+]], -; CHECK: sub [[REG2:[0-9]+]], -; CHECK: iselgt {{[0-9]+}}, [[REG]], [[REG2]] -; CHECK-NO-ISEL: bc 12, 1, [[TRUE:.LBB[0-9]+]] -; CHECK-NO-ISEL: ori 5, 6, 0 -; CHECK-NO-ISEL-NEXT: b [[SUCCESSOR:.LBB[0-9]+]] -; CHECK-NO-ISEL: [[TRUE]] -; CHECK-NO-ISEL: extsh 5, 5 -; CHECK-NO-ISEL-NEXT: add 3, 3, 5 -; CHECK-NO-ISEL-NEXT: blr cond.end: ; preds = %cond.false, %cond.true %cond = phi i32 [ %sub34, %cond.true ], [ %add37, %cond.false ] diff --git a/llvm/test/CodeGen/PowerPC/isel.ll b/llvm/test/CodeGen/PowerPC/isel.ll index c1cceb9670180..5f64df5dfeb91 100644 --- a/llvm/test/CodeGen/PowerPC/isel.ll +++ b/llvm/test/CodeGen/PowerPC/isel.ll @@ -1,38 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" target triple = "powerpc64-unknown-linux-gnu" ; RUN: llc -verify-machineinstrs -mcpu=a2 < %s | FileCheck %s ; RUN: llc -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s -; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-gen-isel=false < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-isel < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s define i64 @test1(i64 %a, i64 %b, i64 %c, i64 %d) { +; CHECK-LABEL: test1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpld 3, 4 +; CHECK-NEXT: isellt 3, 6, 5 +; CHECK-NEXT: blr +; +; CHECK-NO-ISEL-LABEL: test1: +; CHECK-NO-ISEL: # %bb.0: # %entry +; CHECK-NO-ISEL-NEXT: cmpld 3, 4 +; CHECK-NO-ISEL-NEXT: bge 0, .LBB0_2 +; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry +; CHECK-NO-ISEL-NEXT: mr 5, 6 +; CHECK-NO-ISEL-NEXT: .LBB0_2: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 5 +; CHECK-NO-ISEL-NEXT: blr entry: %p = icmp uge i64 %a, %b %x = select i1 %p, i64 %c, i64 %d ret i64 %x -; CHECK-LABEL: @test1 -; CHECK-NO-ISEL-LABEL: @test1 -; CHECK: isel -; CHECK-NO-ISEL: bc 12, 0, [[TRUE:.LBB[0-9]+]] -; CHECK-NO-ISEL: ori 3, 5, 0 -; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL: [[TRUE]] -; CHECK-NO-ISEL-NEXT: addi 3, 6, 0 -; CHECK-NO-ISEL-NEXT: blr } define i32 @test2(i32 %a, i32 %b, i32 %c, i32 %d) { +; CHECK-LABEL: test2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmplw 3, 4 +; CHECK-NEXT: isellt 3, 6, 5 +; CHECK-NEXT: blr +; +; CHECK-NO-ISEL-LABEL: test2: +; CHECK-NO-ISEL: # %bb.0: # %entry +; CHECK-NO-ISEL-NEXT: cmplw 3, 4 +; CHECK-NO-ISEL-NEXT: bge 0, .LBB1_2 +; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry +; CHECK-NO-ISEL-NEXT: mr 5, 6 +; CHECK-NO-ISEL-NEXT: .LBB1_2: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 5 +; CHECK-NO-ISEL-NEXT: blr entry: %p = icmp uge i32 %a, %b %x = select i1 %p, i32 %c, i32 %d ret i32 %x -; CHECK-LABEL: @test2 -; CHECK-NO-ISEL-LABEL: @test2 -; CHECK: isel -; CHECK-NO-ISEL: bc 12, 0, [[TRUE:.LBB[0-9]+]] -; CHECK-NO-ISEL: ori 3, 5, 0 -; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL: [[TRUE]] -; CHECK-NO-ISEL-NEXT: addi 3, 6, 0 -; CHECK-NO-ISEL-NEXT: blr } diff --git a/llvm/test/CodeGen/PowerPC/optcmp.ll b/llvm/test/CodeGen/PowerPC/optcmp.ll index bc265c646d471..831bc97cc0e9f 100644 --- a/llvm/test/CodeGen/PowerPC/optcmp.ll +++ b/llvm/test/CodeGen/PowerPC/optcmp.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -ppc-gpr-icmps=all -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -mattr=-crbits -disable-ppc-cmp-opt=0 | FileCheck %s -; RUN: llc -ppc-gpr-icmps=all 
-verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -mattr=-crbits -disable-ppc-cmp-opt=0 -ppc-gen-isel=false | FileCheck --check-prefix=CHECK-NO-ISEL %s +; RUN: llc -ppc-gpr-icmps=all -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -mattr=-crbits -disable-ppc-cmp-opt=0 -mattr=-isel | FileCheck --check-prefix=CHECK-NO-ISEL %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" target triple = "powerpc64-unknown-linux-gnu" @@ -16,14 +16,12 @@ define signext i32 @foo(i32 signext %a, i32 signext %b, ptr nocapture %c) #0 { ; ; CHECK-NO-ISEL-LABEL: foo: ; CHECK-NO-ISEL: # %bb.0: # %entry -; CHECK-NO-ISEL-NEXT: cmpw 3, 4 ; CHECK-NO-ISEL-NEXT: sub 6, 3, 4 -; CHECK-NO-ISEL-NEXT: bc 12, 1, .LBB0_2 -; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 4, 0 -; CHECK-NO-ISEL-NEXT: b .LBB0_2 -; CHECK-NO-ISEL-NEXT: .LBB0_2: # %entry +; CHECK-NO-ISEL-NEXT: cmpw 3, 4 ; CHECK-NO-ISEL-NEXT: stw 6, 0(5) +; CHECK-NO-ISEL-NEXT: bgtlr 0 +; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 4 ; CHECK-NO-ISEL-NEXT: blr entry: %sub = sub nsw i32 %a, %b @@ -46,14 +44,14 @@ define signext i32 @foo2(i32 signext %a, i32 signext %b, ptr nocapture %c) #0 { ; ; CHECK-NO-ISEL-LABEL: foo2: ; CHECK-NO-ISEL: # %bb.0: # %entry -; CHECK-NO-ISEL-NEXT: slw 4, 3, 4 -; CHECK-NO-ISEL-NEXT: li 6, 0 +; CHECK-NO-ISEL-NEXT: mr 6, 3 ; CHECK-NO-ISEL-NEXT: li 3, 1 +; CHECK-NO-ISEL-NEXT: slw 4, 6, 4 ; CHECK-NO-ISEL-NEXT: cmpwi 4, 0 ; CHECK-NO-ISEL-NEXT: stw 4, 0(5) -; CHECK-NO-ISEL-NEXT: bclr 12, 1, 0 +; CHECK-NO-ISEL-NEXT: bgtlr 0 ; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 6, 0 +; CHECK-NO-ISEL-NEXT: li 3, 0 ; CHECK-NO-ISEL-NEXT: blr entry: %shl = shl i32 %a, %b @@ -74,12 +72,10 @@ define i64 @fool(i64 %a, i64 %b, ptr nocapture %c) #0 { ; CHECK-NO-ISEL-LABEL: fool: ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: sub. 6, 3, 4 -; CHECK-NO-ISEL-NEXT: bc 12, 1, .LBB2_2 -; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 4, 0 -; CHECK-NO-ISEL-NEXT: b .LBB2_2 -; CHECK-NO-ISEL-NEXT: .LBB2_2: # %entry ; CHECK-NO-ISEL-NEXT: std 6, 0(5) +; CHECK-NO-ISEL-NEXT: bgtlr 0 +; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 4 ; CHECK-NO-ISEL-NEXT: blr entry: %sub = sub nsw i64 %a, %b @@ -100,12 +96,10 @@ define i64 @foolb(i64 %a, i64 %b, ptr nocapture %c) #0 { ; CHECK-NO-ISEL-LABEL: foolb: ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: sub. 6, 3, 4 -; CHECK-NO-ISEL-NEXT: bc 12, 1, .LBB3_1 -; CHECK-NO-ISEL-NEXT: b .LBB3_2 -; CHECK-NO-ISEL-NEXT: .LBB3_1: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 4, 0 -; CHECK-NO-ISEL-NEXT: .LBB3_2: # %entry ; CHECK-NO-ISEL-NEXT: std 6, 0(5) +; CHECK-NO-ISEL-NEXT: blelr 0 +; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 4 ; CHECK-NO-ISEL-NEXT: blr entry: %sub = sub nsw i64 %a, %b @@ -126,12 +120,10 @@ define i64 @foolc(i64 %a, i64 %b, ptr nocapture %c) #0 { ; CHECK-NO-ISEL-LABEL: foolc: ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: sub. 
6, 4, 3 -; CHECK-NO-ISEL-NEXT: bc 12, 0, .LBB4_2 -; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 4, 0 -; CHECK-NO-ISEL-NEXT: b .LBB4_2 -; CHECK-NO-ISEL-NEXT: .LBB4_2: # %entry ; CHECK-NO-ISEL-NEXT: std 6, 0(5) +; CHECK-NO-ISEL-NEXT: bltlr 0 +; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 4 ; CHECK-NO-ISEL-NEXT: blr entry: %sub = sub nsw i64 %b, %a @@ -152,12 +144,10 @@ define i64 @foold(i64 %a, i64 %b, ptr nocapture %c) #0 { ; CHECK-NO-ISEL-LABEL: foold: ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: sub. 6, 4, 3 -; CHECK-NO-ISEL-NEXT: bc 12, 1, .LBB5_2 -; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 4, 0 -; CHECK-NO-ISEL-NEXT: b .LBB5_2 -; CHECK-NO-ISEL-NEXT: .LBB5_2: # %entry ; CHECK-NO-ISEL-NEXT: std 6, 0(5) +; CHECK-NO-ISEL-NEXT: bgtlr 0 +; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 4 ; CHECK-NO-ISEL-NEXT: blr entry: %sub = sub nsw i64 %b, %a @@ -178,12 +168,10 @@ define i64 @foold2(i64 %a, i64 %b, ptr nocapture %c) #0 { ; CHECK-NO-ISEL-LABEL: foold2: ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: sub. 6, 3, 4 -; CHECK-NO-ISEL-NEXT: bc 12, 0, .LBB6_2 -; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 4, 0 -; CHECK-NO-ISEL-NEXT: b .LBB6_2 -; CHECK-NO-ISEL-NEXT: .LBB6_2: # %entry ; CHECK-NO-ISEL-NEXT: std 6, 0(5) +; CHECK-NO-ISEL-NEXT: bltlr 0 +; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 4 ; CHECK-NO-ISEL-NEXT: blr entry: %sub = sub nsw i64 %a, %b @@ -336,12 +324,10 @@ define signext i64 @fooct(i64 signext %a, i64 signext %b, ptr nocapture %c) #0 { ; CHECK-NO-ISEL-NEXT: and 6, 6, 7 ; CHECK-NO-ISEL-NEXT: mulld 6, 6, 9 ; CHECK-NO-ISEL-NEXT: rldicl. 6, 6, 8, 56 -; CHECK-NO-ISEL-NEXT: bc 12, 1, .LBB10_2 -; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 4, 0 -; CHECK-NO-ISEL-NEXT: b .LBB10_2 -; CHECK-NO-ISEL-NEXT: .LBB10_2: # %entry ; CHECK-NO-ISEL-NEXT: std 6, 0(5) +; CHECK-NO-ISEL-NEXT: bgtlr 0 +; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 4 ; CHECK-NO-ISEL-NEXT: blr entry: %sub = sub nsw i64 %a, %b diff --git a/llvm/test/CodeGen/PowerPC/p8-isel-sched.ll b/llvm/test/CodeGen/PowerPC/p8-isel-sched.ll index 7e2515ff70938..cde5870db3940 100644 --- a/llvm/test/CodeGen/PowerPC/p8-isel-sched.ll +++ b/llvm/test/CodeGen/PowerPC/p8-isel-sched.ll @@ -1,10 +1,62 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s -; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-gen-isel=false < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mattr=-isel < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" ; Function Attrs: nounwind define void @foo(ptr nocapture %r1, ptr nocapture %r2, ptr nocapture %r3, ptr nocapture %r4, i32 signext %a, i32 signext %b, i32 signext %c, i32 signext %d) #0 { +; Make sure that we don't schedule all of the isels together, they should be +; intermixed with the adds because each isel starts a new dispatch group. 
+; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmplwi 7, 0 +; CHECK-NEXT: addi 7, 8, 1 +; CHECK-NEXT: iseleq 9, 9, 8 +; CHECK-NEXT: stw 9, 0(3) +; CHECK-NEXT: addi 3, 10, -2 +; CHECK-NEXT: iseleq 9, 10, 8 +; CHECK-NEXT: iseleq 3, 3, 7 +; CHECK-NEXT: stw 9, 0(4) +; CHECK-NEXT: addi 4, 10, -5 +; CHECK-NEXT: stw 3, 0(5) +; CHECK-NEXT: addi 3, 8, 3 +; CHECK-NEXT: iseleq 3, 4, 3 +; CHECK-NEXT: stw 3, 0(6) +; CHECK-NEXT: blr +; +; CHECK-NO-ISEL-LABEL: foo: +; CHECK-NO-ISEL: # %bb.0: # %entry +; CHECK-NO-ISEL-NEXT: cmplwi 7, 0 +; CHECK-NO-ISEL-NEXT: mr 7, 8 +; CHECK-NO-ISEL-NEXT: bne 0, .LBB0_2 +; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry +; CHECK-NO-ISEL-NEXT: mr 7, 9 +; CHECK-NO-ISEL-NEXT: .LBB0_2: # %entry +; CHECK-NO-ISEL-NEXT: stw 7, 0(3) +; CHECK-NO-ISEL-NEXT: mr 3, 8 +; CHECK-NO-ISEL-NEXT: bne 0, .LBB0_4 +; CHECK-NO-ISEL-NEXT: # %bb.3: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 10 +; CHECK-NO-ISEL-NEXT: .LBB0_4: # %entry +; CHECK-NO-ISEL-NEXT: stw 3, 0(4) +; CHECK-NO-ISEL-NEXT: bne 0, .LBB0_7 +; CHECK-NO-ISEL-NEXT: # %bb.5: # %entry +; CHECK-NO-ISEL-NEXT: addi 3, 10, -2 +; CHECK-NO-ISEL-NEXT: stw 3, 0(5) +; CHECK-NO-ISEL-NEXT: beq 0, .LBB0_8 +; CHECK-NO-ISEL-NEXT: .LBB0_6: +; CHECK-NO-ISEL-NEXT: addi 3, 8, 3 +; CHECK-NO-ISEL-NEXT: stw 3, 0(6) +; CHECK-NO-ISEL-NEXT: blr +; CHECK-NO-ISEL-NEXT: .LBB0_7: +; CHECK-NO-ISEL-NEXT: addi 3, 8, 1 +; CHECK-NO-ISEL-NEXT: stw 3, 0(5) +; CHECK-NO-ISEL-NEXT: bne 0, .LBB0_6 +; CHECK-NO-ISEL-NEXT: .LBB0_8: # %entry +; CHECK-NO-ISEL-NEXT: addi 3, 10, -5 +; CHECK-NO-ISEL-NEXT: stw 3, 0(6) +; CHECK-NO-ISEL-NEXT: blr entry: %tobool = icmp ne i32 %a, 0 %cond = select i1 %tobool, i32 %b, i32 %c @@ -22,21 +74,4 @@ entry: ret void } -; Make sure that we don't schedule all of the isels together, they should be -; intermixed with the adds because each isel starts a new dispatch group. 
-; CHECK-LABEL: @foo -; CHECK-NO-ISEL-LABEL: @foo -; CHECK: isel -; CHECK-NO-ISEL: bc 12, 2, [[TRUE:.LBB[0-9]+]] -; CHECK-NO-ISEL: b [[SUCCESSOR:.LBB[0-9]+]] -; CHECK-NO-ISEL: [[TRUE]] -; CHECK-NO-ISEL: addi {{[0-9]+}}, {{[0-9]+}}, -2 -; CHECK: addi -; CHECK: isel -; CHECK-NO-ISEL: bc 12, 2, [[TRUE:.LBB[0-9]+]] -; CHECK-NO-ISEL: ori 3, 7, 0 -; CHECK-NO-ISEL-NEXT: b [[SUCCESSOR:.LBB[0-9]+]] -; CHECK-NO-ISEL: [[TRUE]] -; CHECK: blr - attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/ppc-crbits-onoff.ll b/llvm/test/CodeGen/PowerPC/ppc-crbits-onoff.ll index 3d8dcbd00d01a..49a5687afe4c8 100644 --- a/llvm/test/CodeGen/PowerPC/ppc-crbits-onoff.ll +++ b/llvm/test/CodeGen/PowerPC/ppc-crbits-onoff.ll @@ -1,10 +1,33 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -ppc-gpr-icmps=all -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s -; RUN: llc -ppc-gpr-icmps=all -verify-machineinstrs -mcpu=pwr7 -ppc-gen-isel=false < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s +; RUN: llc -ppc-gpr-icmps=all -verify-machineinstrs -mcpu=pwr7 -mattr=-isel < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" ; Function Attrs: nounwind readnone define signext i32 @crbitsoff(i32 signext %v1, i32 signext %v2) #0 { +; CHECK-LABEL: crbitsoff: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cntlzw 4, 4 +; CHECK-NEXT: cmplwi 3, 0 +; CHECK-NEXT: li 3, 1 +; CHECK-NEXT: iseleq 3, 0, 3 +; CHECK-NEXT: rlwinm 4, 4, 27, 5, 31 +; CHECK-NEXT: and 3, 3, 4 +; CHECK-NEXT: blr +; +; CHECK-NO-ISEL-LABEL: crbitsoff: +; CHECK-NO-ISEL: # %bb.0: # %entry +; CHECK-NO-ISEL-NEXT: cmplwi 3, 0 +; CHECK-NO-ISEL-NEXT: li 3, 1 +; CHECK-NO-ISEL-NEXT: bne 0, .LBB0_2 +; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry +; CHECK-NO-ISEL-NEXT: li 3, 0 +; CHECK-NO-ISEL-NEXT: .LBB0_2: # %entry +; CHECK-NO-ISEL-NEXT: cntlzw 4, 4 +; CHECK-NO-ISEL-NEXT: rlwinm 4, 4, 27, 5, 31 +; CHECK-NO-ISEL-NEXT: and 3, 3, 4 +; CHECK-NO-ISEL-NEXT: blr entry: %tobool = icmp ne i32 %v1, 0 %lnot = icmp eq i32 %v2, 0 @@ -12,21 +35,28 @@ entry: %and = zext i1 %and3 to i32 ret i32 %and -; CHECK-LABEL: @crbitsoff -; CHECK-NO-ISEL-LABEL: @crbitsoff -; CHECK-DAG: cmplwi 3, 0 -; CHECK-DAG: li [[REG2:[0-9]+]], 1 -; CHECK-DAG: cntlzw [[REG3:[0-9]+]], -; CHECK: iseleq [[REG4:[0-9]+]], 0, [[REG2]] -; CHECK-NO-ISEL: bc 12, 2, [[TRUE:.LBB[0-9]+]] -; CHECK-NO-ISEL-NEXT: b [[SUCCESSOR:.LBB[0-9]+]] -; CHECK-NO-ISEL: [[TRUE]] -; CHECK-NO-ISEL-NEXT: li 3, 0 -; CHECK: and 3, [[REG4]], [[REG3]] -; CHECK: blr } define signext i32 @crbitson(i32 signext %v1, i32 signext %v2) #1 { +; CHECK-LABEL: crbitson: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cntlzw 3, 3 +; CHECK-NEXT: cntlzw 4, 4 +; CHECK-NEXT: srwi 3, 3, 5 +; CHECK-NEXT: srwi 4, 4, 5 +; CHECK-NEXT: xori 3, 3, 1 +; CHECK-NEXT: and 3, 3, 4 +; CHECK-NEXT: blr +; +; CHECK-NO-ISEL-LABEL: crbitson: +; CHECK-NO-ISEL: # %bb.0: # %entry +; CHECK-NO-ISEL-NEXT: cntlzw 3, 3 +; CHECK-NO-ISEL-NEXT: cntlzw 4, 4 +; CHECK-NO-ISEL-NEXT: srwi 3, 3, 5 +; CHECK-NO-ISEL-NEXT: srwi 4, 4, 5 +; CHECK-NO-ISEL-NEXT: xori 3, 3, 1 +; CHECK-NO-ISEL-NEXT: and 3, 3, 4 +; CHECK-NO-ISEL-NEXT: blr entry: %tobool = icmp ne i32 %v1, 0 %lnot = icmp eq i32 %v2, 0 @@ -34,15 +64,6 @@ entry: %and = zext i1 %and3 to i32 ret i32 %and -; CHECK-LABEL: @crbitson -; CHECK-NO-ISEL-LABEL: @crbitson -; CHECK-DAG: cntlzw [[REG1:[0-9]+]], 3 -; CHECK-DAG: cntlzw [[REG2:[0-9]+]], 4 -; CHECK: srwi [[REG3:[0-9]+]], [[REG1]], 5 -; CHECK: srwi 
[[REG4:[0-9]+]], [[REG2]], 5 -; CHECK: xori [[REG5:[0-9]+]], [[REG3]], 1 -; CHECK: and 3, [[REG5]], [[REG4]] -; CHECK-NEXT: blr } diff --git a/llvm/test/CodeGen/PowerPC/remove-implicit-use.mir b/llvm/test/CodeGen/PowerPC/remove-implicit-use.mir index 28faace491173..f5b931e1e4238 100644 --- a/llvm/test/CodeGen/PowerPC/remove-implicit-use.mir +++ b/llvm/test/CodeGen/PowerPC/remove-implicit-use.mir @@ -1,5 +1,5 @@ # RUN: llc -mtriple=powerpc64le-unknown-unknown -start-after=ppc-mi-peepholes \ -# RUN: -stop-before=ppc-expand-isel -verify-machineinstrs %s -o - | FileCheck %s +# RUN: -stop-after=ppc-pre-emit-peephole -verify-machineinstrs %s -o - | FileCheck %s --- | ; ModuleID = 'a.ll' source_filename = "a.c" diff --git a/llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll b/llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll index d5e77a5cda067..ebf4cbcaac94f 100644 --- a/llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll +++ b/llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -ppc-reduce-cr-logicals -verify-machineinstrs -tail-dup-placement=false < %s | FileCheck %s ; RUN: llc -ppc-reduce-cr-logicals -verify-machineinstrs \ -; RUN: -ppc-gen-isel=false < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s +; RUN: -mattr=-isel < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" @@ -21,14 +21,16 @@ define signext i32 @testi32slt(i32 signext %c1, i32 signext %c2, i32 signext %c3 ; CHECK-NO-ISEL-LABEL: testi32slt: ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: cmpw 5, 6 -; CHECK-NO-ISEL-NEXT: cmpw 1, 3, 4 -; CHECK-NO-ISEL-NEXT: crandc 20, 6, 2 -; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB0_2 +; CHECK-NO-ISEL-NEXT: bc 12, 2, .LBB0_3 ; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 8, 0 +; CHECK-NO-ISEL-NEXT: cmpw 3, 4 +; CHECK-NO-ISEL-NEXT: bc 4, 2, .LBB0_3 +; CHECK-NO-ISEL-NEXT: # %bb.2: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL-NEXT: .LBB0_2: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 7, 0 +; CHECK-NO-ISEL-NEXT: .LBB0_3: # %entry +; CHECK-NO-ISEL-NEXT: mr 7, 8 +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr entry: %cmp1 = icmp eq i32 %c3, %c4 @@ -51,14 +53,16 @@ define signext i32 @testi32ult(i32 signext %c1, i32 signext %c2, i32 signext %c3 ; CHECK-NO-ISEL-LABEL: testi32ult: ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: cmpw 5, 6 -; CHECK-NO-ISEL-NEXT: cmpw 1, 3, 4 -; CHECK-NO-ISEL-NEXT: crandc 20, 2, 6 -; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB1_2 +; CHECK-NO-ISEL-NEXT: bc 4, 2, .LBB1_3 ; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 8, 0 +; CHECK-NO-ISEL-NEXT: cmpw 3, 4 +; CHECK-NO-ISEL-NEXT: bc 12, 2, .LBB1_3 +; CHECK-NO-ISEL-NEXT: # %bb.2: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL-NEXT: .LBB1_2: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 7, 0 +; CHECK-NO-ISEL-NEXT: .LBB1_3: # %entry +; CHECK-NO-ISEL-NEXT: mr 7, 8 +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr entry: %cmp1 = icmp eq i32 %c3, %c4 @@ -81,14 +85,14 @@ define signext i32 @testi32sle(i32 signext %c1, i32 signext %c2, i32 signext %c3 ; CHECK-NO-ISEL-LABEL: testi32sle: ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: cmpw 5, 6 -; CHECK-NO-ISEL-NEXT: cmpw 1, 3, 4 -; CHECK-NO-ISEL-NEXT: crorc 20, 6, 2 -; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB2_2 +; CHECK-NO-ISEL-NEXT: bc 4, 2, .LBB2_3 ; CHECK-NO-ISEL-NEXT: 
# %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 8, 0 -; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL-NEXT: .LBB2_2: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 7, 0 +; CHECK-NO-ISEL-NEXT: cmpw 3, 4 +; CHECK-NO-ISEL-NEXT: bc 12, 2, .LBB2_3 +; CHECK-NO-ISEL-NEXT: # %bb.2: # %entry +; CHECK-NO-ISEL-NEXT: mr 7, 8 +; CHECK-NO-ISEL-NEXT: .LBB2_3: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr entry: %cmp1 = icmp eq i32 %c3, %c4 @@ -111,14 +115,14 @@ define signext i32 @testi32ule(i32 signext %c1, i32 signext %c2, i32 signext %c3 ; CHECK-NO-ISEL-LABEL: testi32ule: ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: cmpw 5, 6 -; CHECK-NO-ISEL-NEXT: cmpw 1, 3, 4 -; CHECK-NO-ISEL-NEXT: crorc 20, 2, 6 -; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB3_2 +; CHECK-NO-ISEL-NEXT: bc 12, 2, .LBB3_3 ; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 8, 0 -; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL-NEXT: .LBB3_2: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 7, 0 +; CHECK-NO-ISEL-NEXT: cmpw 3, 4 +; CHECK-NO-ISEL-NEXT: bc 4, 2, .LBB3_3 +; CHECK-NO-ISEL-NEXT: # %bb.2: # %entry +; CHECK-NO-ISEL-NEXT: mr 7, 8 +; CHECK-NO-ISEL-NEXT: .LBB3_3: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr entry: %cmp1 = icmp eq i32 %c3, %c4 @@ -145,10 +149,9 @@ define signext i32 @testi32eq(i32 signext %c1, i32 signext %c2, i32 signext %c3, ; CHECK-NO-ISEL-NEXT: creqv 20, 6, 2 ; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB4_2 ; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 8, 0 -; CHECK-NO-ISEL-NEXT: blr +; CHECK-NO-ISEL-NEXT: mr 7, 8 ; CHECK-NO-ISEL-NEXT: .LBB4_2: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 7, 0 +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr entry: %cmp1 = icmp eq i32 %c3, %c4 @@ -171,14 +174,14 @@ define signext i32 @testi32sge(i32 signext %c1, i32 signext %c2, i32 signext %c3 ; CHECK-NO-ISEL-LABEL: testi32sge: ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: cmpw 5, 6 -; CHECK-NO-ISEL-NEXT: cmpw 1, 3, 4 -; CHECK-NO-ISEL-NEXT: crorc 20, 2, 6 -; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB5_2 +; CHECK-NO-ISEL-NEXT: bc 12, 2, .LBB5_3 ; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 8, 0 -; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL-NEXT: .LBB5_2: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 7, 0 +; CHECK-NO-ISEL-NEXT: cmpw 3, 4 +; CHECK-NO-ISEL-NEXT: bc 4, 2, .LBB5_3 +; CHECK-NO-ISEL-NEXT: # %bb.2: # %entry +; CHECK-NO-ISEL-NEXT: mr 7, 8 +; CHECK-NO-ISEL-NEXT: .LBB5_3: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr entry: %cmp1 = icmp eq i32 %c3, %c4 @@ -201,14 +204,14 @@ define signext i32 @testi32uge(i32 signext %c1, i32 signext %c2, i32 signext %c3 ; CHECK-NO-ISEL-LABEL: testi32uge: ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: cmpw 5, 6 -; CHECK-NO-ISEL-NEXT: cmpw 1, 3, 4 -; CHECK-NO-ISEL-NEXT: crorc 20, 6, 2 -; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB6_2 +; CHECK-NO-ISEL-NEXT: bc 4, 2, .LBB6_3 ; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 8, 0 -; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL-NEXT: .LBB6_2: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 7, 0 +; CHECK-NO-ISEL-NEXT: cmpw 3, 4 +; CHECK-NO-ISEL-NEXT: bc 12, 2, .LBB6_3 +; CHECK-NO-ISEL-NEXT: # %bb.2: # %entry +; CHECK-NO-ISEL-NEXT: mr 7, 8 +; CHECK-NO-ISEL-NEXT: .LBB6_3: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr entry: %cmp1 = icmp eq i32 %c3, %c4 @@ -231,14 +234,16 @@ define signext i32 @testi32sgt(i32 signext %c1, i32 signext %c2, i32 signext %c3 ; CHECK-NO-ISEL-LABEL: testi32sgt: ; CHECK-NO-ISEL: # %bb.0: # %entry ; 
CHECK-NO-ISEL-NEXT: cmpw 5, 6 -; CHECK-NO-ISEL-NEXT: cmpw 1, 3, 4 -; CHECK-NO-ISEL-NEXT: crandc 20, 2, 6 -; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB7_2 +; CHECK-NO-ISEL-NEXT: bc 4, 2, .LBB7_3 ; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 8, 0 +; CHECK-NO-ISEL-NEXT: cmpw 3, 4 +; CHECK-NO-ISEL-NEXT: bc 12, 2, .LBB7_3 +; CHECK-NO-ISEL-NEXT: # %bb.2: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL-NEXT: .LBB7_2: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 7, 0 +; CHECK-NO-ISEL-NEXT: .LBB7_3: # %entry +; CHECK-NO-ISEL-NEXT: mr 7, 8 +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr entry: %cmp1 = icmp eq i32 %c3, %c4 @@ -261,14 +266,16 @@ define signext i32 @testi32ugt(i32 signext %c1, i32 signext %c2, i32 signext %c3 ; CHECK-NO-ISEL-LABEL: testi32ugt: ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: cmpw 5, 6 -; CHECK-NO-ISEL-NEXT: cmpw 1, 3, 4 -; CHECK-NO-ISEL-NEXT: crandc 20, 6, 2 -; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB8_2 +; CHECK-NO-ISEL-NEXT: bc 12, 2, .LBB8_3 ; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 8, 0 +; CHECK-NO-ISEL-NEXT: cmpw 3, 4 +; CHECK-NO-ISEL-NEXT: bc 4, 2, .LBB8_3 +; CHECK-NO-ISEL-NEXT: # %bb.2: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL-NEXT: .LBB8_2: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 7, 0 +; CHECK-NO-ISEL-NEXT: .LBB8_3: # %entry +; CHECK-NO-ISEL-NEXT: mr 7, 8 +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr entry: %cmp1 = icmp eq i32 %c3, %c4 @@ -295,10 +302,9 @@ define signext i32 @testi32ne(i32 signext %c1, i32 signext %c2, i32 signext %c3, ; CHECK-NO-ISEL-NEXT: crxor 20, 6, 2 ; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB9_2 ; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 8, 0 -; CHECK-NO-ISEL-NEXT: blr +; CHECK-NO-ISEL-NEXT: mr 7, 8 ; CHECK-NO-ISEL-NEXT: .LBB9_2: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 7, 0 +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr entry: %cmp1 = icmp eq i32 %c3, %c4 @@ -321,14 +327,16 @@ define i64 @testi64slt(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 ; CHECK-NO-ISEL-LABEL: testi64slt: ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: cmpd 5, 6 -; CHECK-NO-ISEL-NEXT: cmpd 1, 3, 4 -; CHECK-NO-ISEL-NEXT: crandc 20, 6, 2 -; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB10_2 +; CHECK-NO-ISEL-NEXT: bc 12, 2, .LBB10_3 ; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 8, 0 +; CHECK-NO-ISEL-NEXT: cmpd 3, 4 +; CHECK-NO-ISEL-NEXT: bc 4, 2, .LBB10_3 +; CHECK-NO-ISEL-NEXT: # %bb.2: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL-NEXT: .LBB10_2: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 7, 0 +; CHECK-NO-ISEL-NEXT: .LBB10_3: # %entry +; CHECK-NO-ISEL-NEXT: mr 7, 8 +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr entry: %cmp1 = icmp eq i64 %c3, %c4 @@ -351,14 +359,16 @@ define i64 @testi64ult(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 ; CHECK-NO-ISEL-LABEL: testi64ult: ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: cmpd 5, 6 -; CHECK-NO-ISEL-NEXT: cmpd 1, 3, 4 -; CHECK-NO-ISEL-NEXT: crandc 20, 2, 6 -; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB11_2 +; CHECK-NO-ISEL-NEXT: bc 4, 2, .LBB11_3 ; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 8, 0 +; CHECK-NO-ISEL-NEXT: cmpd 3, 4 +; CHECK-NO-ISEL-NEXT: bc 12, 2, .LBB11_3 +; CHECK-NO-ISEL-NEXT: # %bb.2: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL-NEXT: .LBB11_2: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 7, 0 
+; CHECK-NO-ISEL-NEXT: .LBB11_3: # %entry +; CHECK-NO-ISEL-NEXT: mr 7, 8 +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr entry: %cmp1 = icmp eq i64 %c3, %c4 @@ -381,14 +391,14 @@ define i64 @testi64sle(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 ; CHECK-NO-ISEL-LABEL: testi64sle: ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: cmpd 5, 6 -; CHECK-NO-ISEL-NEXT: cmpd 1, 3, 4 -; CHECK-NO-ISEL-NEXT: crorc 20, 6, 2 -; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB12_2 +; CHECK-NO-ISEL-NEXT: bc 4, 2, .LBB12_3 ; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 8, 0 -; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL-NEXT: .LBB12_2: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 7, 0 +; CHECK-NO-ISEL-NEXT: cmpd 3, 4 +; CHECK-NO-ISEL-NEXT: bc 12, 2, .LBB12_3 +; CHECK-NO-ISEL-NEXT: # %bb.2: # %entry +; CHECK-NO-ISEL-NEXT: mr 7, 8 +; CHECK-NO-ISEL-NEXT: .LBB12_3: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr entry: %cmp1 = icmp eq i64 %c3, %c4 @@ -411,14 +421,14 @@ define i64 @testi64ule(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 ; CHECK-NO-ISEL-LABEL: testi64ule: ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: cmpd 5, 6 -; CHECK-NO-ISEL-NEXT: cmpd 1, 3, 4 -; CHECK-NO-ISEL-NEXT: crorc 20, 2, 6 -; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB13_2 +; CHECK-NO-ISEL-NEXT: bc 12, 2, .LBB13_3 ; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 8, 0 -; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL-NEXT: .LBB13_2: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 7, 0 +; CHECK-NO-ISEL-NEXT: cmpd 3, 4 +; CHECK-NO-ISEL-NEXT: bc 4, 2, .LBB13_3 +; CHECK-NO-ISEL-NEXT: # %bb.2: # %entry +; CHECK-NO-ISEL-NEXT: mr 7, 8 +; CHECK-NO-ISEL-NEXT: .LBB13_3: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr entry: %cmp1 = icmp eq i64 %c3, %c4 @@ -445,10 +455,9 @@ define i64 @testi64eq(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 { ; CHECK-NO-ISEL-NEXT: creqv 20, 6, 2 ; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB14_2 ; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 8, 0 -; CHECK-NO-ISEL-NEXT: blr +; CHECK-NO-ISEL-NEXT: mr 7, 8 ; CHECK-NO-ISEL-NEXT: .LBB14_2: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 7, 0 +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr entry: %cmp1 = icmp eq i64 %c3, %c4 @@ -471,14 +480,14 @@ define i64 @testi64sge(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 ; CHECK-NO-ISEL-LABEL: testi64sge: ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: cmpd 5, 6 -; CHECK-NO-ISEL-NEXT: cmpd 1, 3, 4 -; CHECK-NO-ISEL-NEXT: crorc 20, 2, 6 -; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB15_2 +; CHECK-NO-ISEL-NEXT: bc 12, 2, .LBB15_3 ; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 8, 0 -; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL-NEXT: .LBB15_2: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 7, 0 +; CHECK-NO-ISEL-NEXT: cmpd 3, 4 +; CHECK-NO-ISEL-NEXT: bc 4, 2, .LBB15_3 +; CHECK-NO-ISEL-NEXT: # %bb.2: # %entry +; CHECK-NO-ISEL-NEXT: mr 7, 8 +; CHECK-NO-ISEL-NEXT: .LBB15_3: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr entry: %cmp1 = icmp eq i64 %c3, %c4 @@ -501,14 +510,14 @@ define i64 @testi64uge(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 ; CHECK-NO-ISEL-LABEL: testi64uge: ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: cmpd 5, 6 -; CHECK-NO-ISEL-NEXT: cmpd 1, 3, 4 -; CHECK-NO-ISEL-NEXT: crorc 20, 6, 2 -; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB16_2 +; CHECK-NO-ISEL-NEXT: bc 4, 2, .LBB16_3 ; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 
8, 0 -; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL-NEXT: .LBB16_2: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 7, 0 +; CHECK-NO-ISEL-NEXT: cmpd 3, 4 +; CHECK-NO-ISEL-NEXT: bc 12, 2, .LBB16_3 +; CHECK-NO-ISEL-NEXT: # %bb.2: # %entry +; CHECK-NO-ISEL-NEXT: mr 7, 8 +; CHECK-NO-ISEL-NEXT: .LBB16_3: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr entry: %cmp1 = icmp eq i64 %c3, %c4 @@ -531,14 +540,16 @@ define i64 @testi64sgt(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 ; CHECK-NO-ISEL-LABEL: testi64sgt: ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: cmpd 5, 6 -; CHECK-NO-ISEL-NEXT: cmpd 1, 3, 4 -; CHECK-NO-ISEL-NEXT: crandc 20, 2, 6 -; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB17_2 +; CHECK-NO-ISEL-NEXT: bc 4, 2, .LBB17_3 ; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 8, 0 +; CHECK-NO-ISEL-NEXT: cmpd 3, 4 +; CHECK-NO-ISEL-NEXT: bc 12, 2, .LBB17_3 +; CHECK-NO-ISEL-NEXT: # %bb.2: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL-NEXT: .LBB17_2: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 7, 0 +; CHECK-NO-ISEL-NEXT: .LBB17_3: # %entry +; CHECK-NO-ISEL-NEXT: mr 7, 8 +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr entry: %cmp1 = icmp eq i64 %c3, %c4 @@ -561,14 +572,16 @@ define i64 @testi64ugt(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 ; CHECK-NO-ISEL-LABEL: testi64ugt: ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: cmpd 5, 6 -; CHECK-NO-ISEL-NEXT: cmpd 1, 3, 4 -; CHECK-NO-ISEL-NEXT: crandc 20, 6, 2 -; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB18_2 +; CHECK-NO-ISEL-NEXT: bc 12, 2, .LBB18_3 ; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 8, 0 +; CHECK-NO-ISEL-NEXT: cmpd 3, 4 +; CHECK-NO-ISEL-NEXT: bc 4, 2, .LBB18_3 +; CHECK-NO-ISEL-NEXT: # %bb.2: # %entry +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr -; CHECK-NO-ISEL-NEXT: .LBB18_2: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 7, 0 +; CHECK-NO-ISEL-NEXT: .LBB18_3: # %entry +; CHECK-NO-ISEL-NEXT: mr 7, 8 +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr entry: %cmp1 = icmp eq i64 %c3, %c4 @@ -595,10 +608,9 @@ define i64 @testi64ne(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 { ; CHECK-NO-ISEL-NEXT: crxor 20, 6, 2 ; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB19_2 ; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 8, 0 -; CHECK-NO-ISEL-NEXT: blr +; CHECK-NO-ISEL-NEXT: mr 7, 8 ; CHECK-NO-ISEL-NEXT: .LBB19_2: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 7, 0 +; CHECK-NO-ISEL-NEXT: mr 3, 7 ; CHECK-NO-ISEL-NEXT: blr entry: %cmp1 = icmp eq i64 %c3, %c4 diff --git a/llvm/test/CodeGen/PowerPC/subreg-postra-2.ll b/llvm/test/CodeGen/PowerPC/subreg-postra-2.ll index f952c0e2d8fba..f696745c9d414 100644 --- a/llvm/test/CodeGen/PowerPC/subreg-postra-2.ll +++ b/llvm/test/CodeGen/PowerPC/subreg-postra-2.ll @@ -1,10 +1,89 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-gep-opt=0 < %s | FileCheck %s -; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-gen-isel=false -ppc-gep-opt=0 < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-isel -ppc-gep-opt=0 < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" ; Function Attrs: nounwind define void @jbd2_journal_commit_transaction(i32 %input1, ptr %input2, ptr %input3, ptr %input4) #0 { +; CHECK-LABEL: jbd2_journal_commit_transaction: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi 7, 3, 1 +; CHECK-NEXT: cmplwi 1, 3, 0 +; CHECK-NEXT: li 8, -5 +; CHECK-NEXT: lis 9, 4 +; CHECK-NEXT: cmpld 6, 4, 5 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_1: # %while.body392 +; CHECK-NEXT: # +; CHECK-NEXT: bne- 1, .LBB0_4 +; CHECK-NEXT: # %bb.2: # %wait_on_buffer.exit1319 +; CHECK-NEXT: # +; CHECK-NEXT: ld 4, 0(6) +; CHECK-NEXT: mr 5, 4 +; CHECK-NEXT: ldu 10, -72(5) +; CHECK-NEXT: andi. 10, 10, 1 +; CHECK-NEXT: crmove 20, 1 +; CHECK-NEXT: #APP +; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: .long 2101356712 +; CHECK-NEXT: andc 10, 10, 9 +; CHECK-NEXT: stdcx. 10, 0, 5 +; CHECK-NEXT: bne- 0, .Ltmp0 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: std 4, 0(6) +; CHECK-NEXT: bne+ 6, .LBB0_1 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: isel 7, 3, 8, 20 +; CHECK-NEXT: .LBB0_4: # %while.end418 +; CHECK-NEXT: cmplwi 7, 0 +; CHECK-NEXT: beq 0, .LBB0_6 +; CHECK-NEXT: # %bb.5: # %if.then420 +; CHECK-NEXT: .LBB0_6: # %if.end421 +; +; CHECK-NO-ISEL-LABEL: jbd2_journal_commit_transaction: +; CHECK-NO-ISEL: # %bb.0: # %entry +; CHECK-NO-ISEL-NEXT: addi 7, 3, 1 +; CHECK-NO-ISEL-NEXT: cmplwi 1, 3, 0 +; CHECK-NO-ISEL-NEXT: lis 8, 4 +; CHECK-NO-ISEL-NEXT: cmpld 5, 4, 5 +; CHECK-NO-ISEL-NEXT: b .LBB0_2 +; CHECK-NO-ISEL-NEXT: .p2align 4 +; CHECK-NO-ISEL-NEXT: .LBB0_1: # %wait_on_buffer.exit1319 +; CHECK-NO-ISEL-NEXT: # +; CHECK-NO-ISEL-NEXT: #APP +; CHECK-NO-ISEL-NEXT: .Ltmp0: +; CHECK-NO-ISEL-NEXT: .long 2101364904 +; CHECK-NO-ISEL-NEXT: andc 10, 10, 8 +; CHECK-NO-ISEL-NEXT: stdcx. 10, 0, 9 +; CHECK-NO-ISEL-NEXT: bne- 0, .Ltmp0 +; CHECK-NO-ISEL-EMPTY: +; CHECK-NO-ISEL-NEXT: #NO_APP +; CHECK-NO-ISEL-NEXT: std 5, 0(6) +; CHECK-NO-ISEL-NEXT: beq- 5, .LBB0_6 +; CHECK-NO-ISEL-NEXT: .LBB0_2: # %while.body392 +; CHECK-NO-ISEL-NEXT: # +; CHECK-NO-ISEL-NEXT: bne- 1, .LBB0_5 +; CHECK-NO-ISEL-NEXT: # %bb.3: # %wait_on_buffer.exit1319 +; CHECK-NO-ISEL-NEXT: # +; CHECK-NO-ISEL-NEXT: ld 5, 0(6) +; CHECK-NO-ISEL-NEXT: mr 9, 5 +; CHECK-NO-ISEL-NEXT: ldu 4, -72(9) +; CHECK-NO-ISEL-NEXT: andi. 4, 4, 1 +; CHECK-NO-ISEL-NEXT: mr 4, 3 +; CHECK-NO-ISEL-NEXT: bc 12, 1, .LBB0_1 +; CHECK-NO-ISEL-NEXT: # %bb.4: # %wait_on_buffer.exit1319 +; CHECK-NO-ISEL-NEXT: # +; CHECK-NO-ISEL-NEXT: li 4, -5 +; CHECK-NO-ISEL-NEXT: b .LBB0_1 +; CHECK-NO-ISEL-NEXT: .LBB0_5: +; CHECK-NO-ISEL-NEXT: mr 4, 7 +; CHECK-NO-ISEL-NEXT: .LBB0_6: # %while.end418 +; CHECK-NO-ISEL-NEXT: cmplwi 4, 0 +; CHECK-NO-ISEL-NEXT: beq 0, .LBB0_8 +; CHECK-NO-ISEL-NEXT: # %bb.7: # %if.then420 +; CHECK-NO-ISEL-NEXT: .LBB0_8: # %if.end421 entry: br label %while.body392 @@ -30,17 +109,6 @@ while.end418: ; preds = %wait_on_buffer.exit %tobool419 = icmp eq i32 %err.4.lcssa, 0 br i1 %tobool419, label %if.end421, label %if.then420 -; CHECK-LABEL: @jbd2_journal_commit_transaction -; CHECK-NO-ISEL-LABEL: @jbd2_journal_commit_transaction -; CHECK: andi. -; CHECK: crmove [[REG:[0-9]+]], 1 -; CHECK: stdcx. 
-; CHECK: isel {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, [[REG]] -; CHECK-NO-ISEL: bc 12, 20, [[TRUE:.LBB[0-9]+]] -; CHECK-NO-ISEL: ori 7, 8, 0 -; CHECK-NO-ISEL-NEXT: b [[SUCCESSOR:.LBB[0-9]+]] -; CHECK-NO-ISEL: [[TRUE]] -; CHECK-NO-ISEL: addi 7, 3, 0 if.then420: ; preds = %while.end418 unreachable diff --git a/llvm/test/CodeGen/PowerPC/subreg-postra.ll b/llvm/test/CodeGen/PowerPC/subreg-postra.ll index 32a1b85cac8f5..a315da545ba0f 100644 --- a/llvm/test/CodeGen/PowerPC/subreg-postra.ll +++ b/llvm/test/CodeGen/PowerPC/subreg-postra.ll @@ -1,10 +1,232 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s -; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-gen-isel=false < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mattr=-isel < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" ; Function Attrs: nounwind define void @jbd2_journal_commit_transaction(ptr %journal, i64 %inp1, i32 %inp2, +; CHECK-LABEL: jbd2_journal_commit_transaction: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mfcr 12 +; CHECK-NEXT: mflr 0 +; CHECK-NEXT: stw 12, 8(1) +; CHECK-NEXT: stdu 1, -176(1) +; CHECK-NEXT: lbz 6, 295(1) +; CHECK-NEXT: std 0, 192(1) +; CHECK-NEXT: andi. 6, 6, 1 +; CHECK-NEXT: std 25, 120(1) # 8-byte Folded Spill +; CHECK-NEXT: std 26, 128(1) # 8-byte Folded Spill +; CHECK-NEXT: std 27, 136(1) # 8-byte Folded Spill +; CHECK-NEXT: std 28, 144(1) # 8-byte Folded Spill +; CHECK-NEXT: std 29, 152(1) # 8-byte Folded Spill +; CHECK-NEXT: std 30, 160(1) # 8-byte Folded Spill +; CHECK-NEXT: crmove 9, 1 +; CHECK-NEXT: andi. 6, 10, 1 +; CHECK-NEXT: crmove 8, 1 +; CHECK-NEXT: andi. 
6, 9, 1 +; CHECK-NEXT: bc 4, 20, .LBB0_24 +; CHECK-NEXT: # %bb.1: # %do.body +; CHECK-NEXT: bc 4, 20, .LBB0_25 +; CHECK-NEXT: # %bb.2: # %trace_jbd2_start_commit.exit +; CHECK-NEXT: mr 30, 8 +; CHECK-NEXT: mr 29, 7 +; CHECK-NEXT: bc 12, 20, .LBB0_4 +; CHECK-NEXT: # %bb.3: # %do.body.i1116 +; CHECK-NEXT: bc 4, 20, .LBB0_26 +; CHECK-NEXT: .LBB0_4: # %trace_jbd2_commit_locking.exit +; CHECK-NEXT: bc 4, 20, .LBB0_27 +; CHECK-NEXT: # %bb.5: # %spin_unlock.exit1146 +; CHECK-NEXT: bc 4, 20, .LBB0_28 +; CHECK-NEXT: # %bb.6: # %trace_jbd2_commit_flushing.exit +; CHECK-NEXT: bc 4, 20, .LBB0_29 +; CHECK-NEXT: # %bb.7: # %for.end.i +; CHECK-NEXT: bc 4, 20, .LBB0_31 +; CHECK-NEXT: # %bb.8: # %journal_submit_data_buffers.exit +; CHECK-NEXT: bc 4, 20, .LBB0_32 +; CHECK-NEXT: # %bb.9: # %if.end103 +; CHECK-NEXT: bc 4, 20, .LBB0_33 +; CHECK-NEXT: # %bb.10: # %trace_jbd2_commit_logging.exit +; CHECK-NEXT: bc 4, 20, .LBB0_34 +; CHECK-NEXT: # %bb.11: # %for.end.i1287 +; CHECK-NEXT: bc 4, 20, .LBB0_35 +; CHECK-NEXT: # %bb.12: # %journal_finish_inode_data_buffers.exit +; CHECK-NEXT: bc 4, 20, .LBB0_36 +; CHECK-NEXT: # %bb.13: # %if.end256 +; CHECK-NEXT: cmpdi 1, 4, 0 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_14: # %while.body318 +; CHECK-NEXT: # +; CHECK-NEXT: bc 4, 6, .LBB0_19 +; CHECK-NEXT: # %bb.15: # %wait_on_buffer.exit +; CHECK-NEXT: # +; CHECK-NEXT: bc 4, 1, .LBB0_14 +; CHECK-NEXT: # %bb.16: # %do.body378 +; CHECK-NEXT: bc 4, 8, .LBB0_20 +; CHECK-NEXT: # %bb.17: # %while.end418 +; CHECK-NEXT: bc 4, 8, .LBB0_23 +; CHECK-NEXT: .LBB0_18: # %if.end421 +; CHECK-NEXT: .LBB0_19: # %if.then.i1296 +; CHECK-NEXT: .LBB0_20: # %while.body392.lr.ph +; CHECK-NEXT: lis 26, 4 +; CHECK-NEXT: mr 27, 5 +; CHECK-NEXT: mr 28, 3 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_21: # %while.body392 +; CHECK-NEXT: # +; CHECK-NEXT: ld 3, 0(3) +; CHECK-NEXT: ldu 25, -72(3) +; CHECK-NEXT: #APP +; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: .long 2088769704 +; CHECK-NEXT: andc 4, 4, 26 +; CHECK-NEXT: stdcx. 4, 0, 3 +; CHECK-NEXT: bne- 0, .Ltmp0 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: ld 3, 0(29) +; CHECK-NEXT: std 3, 0(30) +; CHECK-NEXT: bl __brelse +; CHECK-NEXT: nop +; CHECK-NEXT: bc 4, 9, .LBB0_21 +; CHECK-NEXT: # %bb.22: # %while.end418.loopexit +; CHECK-NEXT: andi. 3, 25, 1 +; CHECK-NEXT: li 3, -5 +; CHECK-NEXT: mr 5, 27 +; CHECK-NEXT: iselgt 5, 5, 3 +; CHECK-NEXT: mr 3, 28 +; CHECK-NEXT: bc 12, 8, .LBB0_18 +; CHECK-NEXT: .LBB0_23: # %if.then420 +; CHECK-NEXT: extsw 4, 5 +; CHECK-NEXT: bl jbd2_journal_abort +; CHECK-NEXT: nop +; CHECK-NEXT: .LBB0_24: # %if.then5 +; CHECK-NEXT: .LBB0_25: # %do.body.i +; CHECK-NEXT: .LBB0_26: # %do.body5.i1122 +; CHECK-NEXT: .LBB0_27: # %if.then.i.i.i.i1144 +; CHECK-NEXT: .LBB0_28: # %do.body.i1159 +; CHECK-NEXT: .LBB0_29: # %for.body.lr.ph.i +; CHECK-NEXT: bc 4, 20, .LBB0_37 +; CHECK-NEXT: # %bb.30: # %spin_unlock.exit.i +; CHECK-NEXT: .LBB0_31: # %if.then.i.i.i.i31.i +; CHECK-NEXT: .LBB0_32: # %if.then102 +; CHECK-NEXT: .LBB0_33: # %do.body.i1182 +; CHECK-NEXT: .LBB0_34: # %for.body.i1277 +; CHECK-NEXT: .LBB0_35: # %if.then.i.i.i.i84.i +; CHECK-NEXT: .LBB0_36: # %if.then249 +; CHECK-NEXT: .LBB0_37: # %if.then.i.i.i.i.i +; +; CHECK-NO-ISEL-LABEL: jbd2_journal_commit_transaction: +; CHECK-NO-ISEL: # %bb.0: # %entry +; CHECK-NO-ISEL-NEXT: mfcr 12 +; CHECK-NO-ISEL-NEXT: mflr 0 +; CHECK-NO-ISEL-NEXT: stw 12, 8(1) +; CHECK-NO-ISEL-NEXT: stdu 1, -176(1) +; CHECK-NO-ISEL-NEXT: lbz 6, 295(1) +; CHECK-NO-ISEL-NEXT: std 0, 192(1) +; CHECK-NO-ISEL-NEXT: andi. 
6, 6, 1 +; CHECK-NO-ISEL-NEXT: std 25, 120(1) # 8-byte Folded Spill +; CHECK-NO-ISEL-NEXT: std 26, 128(1) # 8-byte Folded Spill +; CHECK-NO-ISEL-NEXT: std 27, 136(1) # 8-byte Folded Spill +; CHECK-NO-ISEL-NEXT: std 28, 144(1) # 8-byte Folded Spill +; CHECK-NO-ISEL-NEXT: std 29, 152(1) # 8-byte Folded Spill +; CHECK-NO-ISEL-NEXT: std 30, 160(1) # 8-byte Folded Spill +; CHECK-NO-ISEL-NEXT: crmove 9, 1 +; CHECK-NO-ISEL-NEXT: andi. 6, 10, 1 +; CHECK-NO-ISEL-NEXT: crmove 8, 1 +; CHECK-NO-ISEL-NEXT: andi. 6, 9, 1 +; CHECK-NO-ISEL-NEXT: bc 4, 20, .LBB0_26 +; CHECK-NO-ISEL-NEXT: # %bb.1: # %do.body +; CHECK-NO-ISEL-NEXT: bc 4, 20, .LBB0_27 +; CHECK-NO-ISEL-NEXT: # %bb.2: # %trace_jbd2_start_commit.exit +; CHECK-NO-ISEL-NEXT: mr 30, 8 +; CHECK-NO-ISEL-NEXT: mr 29, 7 +; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB0_4 +; CHECK-NO-ISEL-NEXT: # %bb.3: # %do.body.i1116 +; CHECK-NO-ISEL-NEXT: bc 4, 20, .LBB0_28 +; CHECK-NO-ISEL-NEXT: .LBB0_4: # %trace_jbd2_commit_locking.exit +; CHECK-NO-ISEL-NEXT: bc 4, 20, .LBB0_29 +; CHECK-NO-ISEL-NEXT: # %bb.5: # %spin_unlock.exit1146 +; CHECK-NO-ISEL-NEXT: bc 4, 20, .LBB0_30 +; CHECK-NO-ISEL-NEXT: # %bb.6: # %trace_jbd2_commit_flushing.exit +; CHECK-NO-ISEL-NEXT: bc 4, 20, .LBB0_31 +; CHECK-NO-ISEL-NEXT: # %bb.7: # %for.end.i +; CHECK-NO-ISEL-NEXT: bc 4, 20, .LBB0_33 +; CHECK-NO-ISEL-NEXT: # %bb.8: # %journal_submit_data_buffers.exit +; CHECK-NO-ISEL-NEXT: bc 4, 20, .LBB0_34 +; CHECK-NO-ISEL-NEXT: # %bb.9: # %if.end103 +; CHECK-NO-ISEL-NEXT: bc 4, 20, .LBB0_35 +; CHECK-NO-ISEL-NEXT: # %bb.10: # %trace_jbd2_commit_logging.exit +; CHECK-NO-ISEL-NEXT: bc 4, 20, .LBB0_36 +; CHECK-NO-ISEL-NEXT: # %bb.11: # %for.end.i1287 +; CHECK-NO-ISEL-NEXT: bc 4, 20, .LBB0_37 +; CHECK-NO-ISEL-NEXT: # %bb.12: # %journal_finish_inode_data_buffers.exit +; CHECK-NO-ISEL-NEXT: bc 4, 20, .LBB0_38 +; CHECK-NO-ISEL-NEXT: # %bb.13: # %if.end256 +; CHECK-NO-ISEL-NEXT: cmpdi 1, 4, 0 +; CHECK-NO-ISEL-NEXT: .p2align 4 +; CHECK-NO-ISEL-NEXT: .LBB0_14: # %while.body318 +; CHECK-NO-ISEL-NEXT: # +; CHECK-NO-ISEL-NEXT: bc 4, 6, .LBB0_19 +; CHECK-NO-ISEL-NEXT: # %bb.15: # %wait_on_buffer.exit +; CHECK-NO-ISEL-NEXT: # +; CHECK-NO-ISEL-NEXT: bc 4, 1, .LBB0_14 +; CHECK-NO-ISEL-NEXT: # %bb.16: # %do.body378 +; CHECK-NO-ISEL-NEXT: bc 4, 8, .LBB0_20 +; CHECK-NO-ISEL-NEXT: # %bb.17: # %while.end418 +; CHECK-NO-ISEL-NEXT: bc 4, 8, .LBB0_25 +; CHECK-NO-ISEL-NEXT: .LBB0_18: # %if.end421 +; CHECK-NO-ISEL-NEXT: .LBB0_19: # %if.then.i1296 +; CHECK-NO-ISEL-NEXT: .LBB0_20: # %while.body392.lr.ph +; CHECK-NO-ISEL-NEXT: lis 26, 4 +; CHECK-NO-ISEL-NEXT: mr 27, 5 +; CHECK-NO-ISEL-NEXT: mr 28, 3 +; CHECK-NO-ISEL-NEXT: .p2align 4 +; CHECK-NO-ISEL-NEXT: .LBB0_21: # %while.body392 +; CHECK-NO-ISEL-NEXT: # +; CHECK-NO-ISEL-NEXT: ld 3, 0(3) +; CHECK-NO-ISEL-NEXT: ldu 25, -72(3) +; CHECK-NO-ISEL-NEXT: #APP +; CHECK-NO-ISEL-NEXT: .Ltmp0: +; CHECK-NO-ISEL-NEXT: .long 2088769704 +; CHECK-NO-ISEL-NEXT: andc 4, 4, 26 +; CHECK-NO-ISEL-NEXT: stdcx. 4, 0, 3 +; CHECK-NO-ISEL-NEXT: bne- 0, .Ltmp0 +; CHECK-NO-ISEL-EMPTY: +; CHECK-NO-ISEL-NEXT: #NO_APP +; CHECK-NO-ISEL-NEXT: ld 3, 0(29) +; CHECK-NO-ISEL-NEXT: std 3, 0(30) +; CHECK-NO-ISEL-NEXT: bl __brelse +; CHECK-NO-ISEL-NEXT: nop +; CHECK-NO-ISEL-NEXT: bc 4, 9, .LBB0_21 +; CHECK-NO-ISEL-NEXT: # %bb.22: # %while.end418.loopexit +; CHECK-NO-ISEL-NEXT: andi. 
3, 25, 1 +; CHECK-NO-ISEL-NEXT: mr 5, 27 +; CHECK-NO-ISEL-NEXT: bc 12, 1, .LBB0_24 +; CHECK-NO-ISEL-NEXT: # %bb.23: # %while.end418.loopexit +; CHECK-NO-ISEL-NEXT: li 5, -5 +; CHECK-NO-ISEL-NEXT: .LBB0_24: # %while.end418.loopexit +; CHECK-NO-ISEL-NEXT: mr 3, 28 +; CHECK-NO-ISEL-NEXT: bc 12, 8, .LBB0_18 +; CHECK-NO-ISEL-NEXT: .LBB0_25: # %if.then420 +; CHECK-NO-ISEL-NEXT: extsw 4, 5 +; CHECK-NO-ISEL-NEXT: bl jbd2_journal_abort +; CHECK-NO-ISEL-NEXT: nop +; CHECK-NO-ISEL-NEXT: .LBB0_26: # %if.then5 +; CHECK-NO-ISEL-NEXT: .LBB0_27: # %do.body.i +; CHECK-NO-ISEL-NEXT: .LBB0_28: # %do.body5.i1122 +; CHECK-NO-ISEL-NEXT: .LBB0_29: # %if.then.i.i.i.i1144 +; CHECK-NO-ISEL-NEXT: .LBB0_30: # %do.body.i1159 +; CHECK-NO-ISEL-NEXT: .LBB0_31: # %for.body.lr.ph.i +; CHECK-NO-ISEL-NEXT: bc 4, 20, .LBB0_39 +; CHECK-NO-ISEL-NEXT: # %bb.32: # %spin_unlock.exit.i +; CHECK-NO-ISEL-NEXT: .LBB0_33: # %if.then.i.i.i.i31.i +; CHECK-NO-ISEL-NEXT: .LBB0_34: # %if.then102 +; CHECK-NO-ISEL-NEXT: .LBB0_35: # %do.body.i1182 +; CHECK-NO-ISEL-NEXT: .LBB0_36: # %for.body.i1277 +; CHECK-NO-ISEL-NEXT: .LBB0_37: # %if.then.i.i.i.i84.i +; CHECK-NO-ISEL-NEXT: .LBB0_38: # %if.then249 +; CHECK-NO-ISEL-NEXT: .LBB0_39: # %if.then.i.i.i.i.i ptr %inp3, ptr %inp4, ptr %inp5, i1 %inp6, i1 %inp7, i1 %inp8) #0 { @@ -144,15 +366,6 @@ wait_on_buffer.exit1319: ; preds = %while.body392 call void @__brelse(ptr %3) #1 br i1 %inp8, label %while.end418, label %while.body392 -; CHECK-LABEL: @jbd2_journal_commit_transaction -; CHECK-NO-ISEL-LABEL: @jbd2_journal_commit_transaction -; CHECK: andi. -; CHECK: crmove -; CHECK: stdcx. -; CHECK: iselgt {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} -; CHECK-NO-ISEL: bc 12, 1, [[TRUE:.LBB[0-9]+]] -; CHECK-NO-ISEL: ori 5, 3, 0 -; CHECK-NO-ISEL: b [[SUCCESSOR:.LBB[0-9]+]] while.end418: ; preds = %wait_on_buffer.exit1319, %do.body378 diff --git a/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll b/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll index 1dbb060fc35fa..2e7ca02531470 100644 --- a/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll +++ b/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll @@ -260,13 +260,13 @@ define half @caller_half_32(<32 x half> %A) nounwind { ; ZHINX32-NEXT: sw s9, 68(sp) # 4-byte Folded Spill ; ZHINX32-NEXT: sw s10, 64(sp) # 4-byte Folded Spill ; ZHINX32-NEXT: sw s11, 60(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: lh t0, 112(sp) +; ZHINX32-NEXT: lh t0, 124(sp) ; ZHINX32-NEXT: sw t0, 56(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: lh t0, 116(sp) -; ZHINX32-NEXT: sw t0, 52(sp) # 4-byte Folded Spill ; ZHINX32-NEXT: lh t0, 120(sp) +; ZHINX32-NEXT: sw t0, 52(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: lh t0, 116(sp) ; ZHINX32-NEXT: sw t0, 48(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: lh t0, 124(sp) +; ZHINX32-NEXT: lh t0, 112(sp) ; ZHINX32-NEXT: sw t0, 44(sp) # 4-byte Folded Spill ; ZHINX32-NEXT: lh t6, 128(sp) ; ZHINX32-NEXT: lh t5, 132(sp) @@ -308,10 +308,10 @@ define half @caller_half_32(<32 x half> %A) nounwind { ; ZHINX32-NEXT: sh t4, 4(sp) ; ZHINX32-NEXT: sh t5, 2(sp) ; ZHINX32-NEXT: sh t6, 0(sp) -; ZHINX32-NEXT: lw t3, 56(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw t4, 52(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw t5, 48(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw t6, 44(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw t3, 44(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw t4, 48(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw t5, 52(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw t6, 56(sp) # 4-byte Folded Reload ; ZHINX32-NEXT: call callee_half_32 ; ZHINX32-NEXT: lw ra, 108(sp) # 4-byte 
Folded Reload ; ZHINX32-NEXT: lw s0, 104(sp) # 4-byte Folded Reload @@ -345,13 +345,13 @@ define half @caller_half_32(<32 x half> %A) nounwind { ; ZHINX64-NEXT: sd s9, 88(sp) # 8-byte Folded Spill ; ZHINX64-NEXT: sd s10, 80(sp) # 8-byte Folded Spill ; ZHINX64-NEXT: sd s11, 72(sp) # 8-byte Folded Spill -; ZHINX64-NEXT: lh t0, 176(sp) +; ZHINX64-NEXT: lh t0, 200(sp) ; ZHINX64-NEXT: sd t0, 64(sp) # 8-byte Folded Spill -; ZHINX64-NEXT: lh t0, 184(sp) -; ZHINX64-NEXT: sd t0, 56(sp) # 8-byte Folded Spill ; ZHINX64-NEXT: lh t0, 192(sp) +; ZHINX64-NEXT: sd t0, 56(sp) # 8-byte Folded Spill +; ZHINX64-NEXT: lh t0, 184(sp) ; ZHINX64-NEXT: sd t0, 48(sp) # 8-byte Folded Spill -; ZHINX64-NEXT: lh t0, 200(sp) +; ZHINX64-NEXT: lh t0, 176(sp) ; ZHINX64-NEXT: sd t0, 40(sp) # 8-byte Folded Spill ; ZHINX64-NEXT: lh t6, 208(sp) ; ZHINX64-NEXT: lh t5, 216(sp) @@ -393,10 +393,10 @@ define half @caller_half_32(<32 x half> %A) nounwind { ; ZHINX64-NEXT: sh t4, 4(sp) ; ZHINX64-NEXT: sh t5, 2(sp) ; ZHINX64-NEXT: sh t6, 0(sp) -; ZHINX64-NEXT: ld t3, 64(sp) # 8-byte Folded Reload -; ZHINX64-NEXT: ld t4, 56(sp) # 8-byte Folded Reload -; ZHINX64-NEXT: ld t5, 48(sp) # 8-byte Folded Reload -; ZHINX64-NEXT: ld t6, 40(sp) # 8-byte Folded Reload +; ZHINX64-NEXT: ld t3, 40(sp) # 8-byte Folded Reload +; ZHINX64-NEXT: ld t4, 48(sp) # 8-byte Folded Reload +; ZHINX64-NEXT: ld t5, 56(sp) # 8-byte Folded Reload +; ZHINX64-NEXT: ld t6, 64(sp) # 8-byte Folded Reload ; ZHINX64-NEXT: call callee_half_32 ; ZHINX64-NEXT: ld ra, 168(sp) # 8-byte Folded Reload ; ZHINX64-NEXT: ld s0, 160(sp) # 8-byte Folded Reload @@ -832,32 +832,28 @@ define fastcc float @callee_float_32(<32 x float> %A) nounwind { define float @caller_float_32(<32 x float> %A) nounwind { ; ZHINX32-LABEL: caller_float_32: ; ZHINX32: # %bb.0: -; ZHINX32-NEXT: addi sp, sp, -160 -; ZHINX32-NEXT: sw ra, 156(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: sw s0, 152(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: sw s1, 148(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: sw s2, 144(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: sw s3, 140(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: sw s4, 136(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: sw s5, 132(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: sw s6, 128(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: sw s7, 124(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: sw s8, 120(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: sw s9, 116(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: sw s10, 112(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: sw s11, 108(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: addi sp, sp, -144 +; ZHINX32-NEXT: sw ra, 140(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw s0, 136(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw s1, 132(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw s2, 128(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw s3, 124(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw s4, 120(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw s5, 116(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw s6, 112(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw s7, 108(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw s8, 104(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw s9, 100(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw s10, 96(sp) # 4-byte Folded Spill +; ZHINX32-NEXT: sw s11, 92(sp) # 4-byte Folded Spill ; ZHINX32-NEXT: lw t0, 160(sp) -; ZHINX32-NEXT: sw t0, 104(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: lw t0, 164(sp) -; ZHINX32-NEXT: sw t0, 100(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: lw t0, 168(sp) -; ZHINX32-NEXT: sw t0, 96(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: lw t0, 172(sp) 
-; ZHINX32-NEXT: sw t0, 92(sp) # 4-byte Folded Spill -; ZHINX32-NEXT: lw t6, 176(sp) -; ZHINX32-NEXT: lw t5, 180(sp) -; ZHINX32-NEXT: lw t4, 184(sp) -; ZHINX32-NEXT: lw s0, 188(sp) +; ZHINX32-NEXT: lw t1, 164(sp) +; ZHINX32-NEXT: lw t2, 168(sp) +; ZHINX32-NEXT: lw s0, 172(sp) +; ZHINX32-NEXT: lw t3, 176(sp) +; ZHINX32-NEXT: lw t4, 180(sp) +; ZHINX32-NEXT: lw t5, 184(sp) +; ZHINX32-NEXT: lw t6, 188(sp) ; ZHINX32-NEXT: lw s1, 192(sp) ; ZHINX32-NEXT: lw s2, 196(sp) ; ZHINX32-NEXT: lw s3, 200(sp) @@ -870,49 +866,45 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZHINX32-NEXT: lw s10, 228(sp) ; ZHINX32-NEXT: lw s11, 232(sp) ; ZHINX32-NEXT: lw ra, 236(sp) -; ZHINX32-NEXT: lw t3, 240(sp) -; ZHINX32-NEXT: lw t2, 244(sp) -; ZHINX32-NEXT: lw t1, 248(sp) -; ZHINX32-NEXT: lw t0, 252(sp) -; ZHINX32-NEXT: sw t0, 76(sp) -; ZHINX32-NEXT: sw t1, 72(sp) -; ZHINX32-NEXT: sw t2, 68(sp) -; ZHINX32-NEXT: sw t3, 64(sp) -; ZHINX32-NEXT: sw ra, 60(sp) -; ZHINX32-NEXT: sw s11, 56(sp) -; ZHINX32-NEXT: sw s10, 52(sp) -; ZHINX32-NEXT: sw s9, 48(sp) -; ZHINX32-NEXT: sw s8, 44(sp) -; ZHINX32-NEXT: sw s7, 40(sp) -; ZHINX32-NEXT: sw s6, 36(sp) -; ZHINX32-NEXT: sw s5, 32(sp) -; ZHINX32-NEXT: sw s4, 28(sp) -; ZHINX32-NEXT: sw s3, 24(sp) -; ZHINX32-NEXT: sw s2, 20(sp) -; ZHINX32-NEXT: sw s1, 16(sp) +; ZHINX32-NEXT: sw ra, 76(sp) +; ZHINX32-NEXT: sw s11, 72(sp) +; ZHINX32-NEXT: sw s10, 68(sp) +; ZHINX32-NEXT: sw s9, 64(sp) +; ZHINX32-NEXT: sw s8, 60(sp) +; ZHINX32-NEXT: sw s7, 56(sp) +; ZHINX32-NEXT: sw s6, 52(sp) +; ZHINX32-NEXT: sw s5, 48(sp) +; ZHINX32-NEXT: sw s4, 44(sp) +; ZHINX32-NEXT: sw s3, 40(sp) +; ZHINX32-NEXT: sw s2, 36(sp) +; ZHINX32-NEXT: sw s1, 32(sp) +; ZHINX32-NEXT: sw t6, 28(sp) +; ZHINX32-NEXT: sw t5, 24(sp) +; ZHINX32-NEXT: sw t4, 20(sp) +; ZHINX32-NEXT: sw t3, 16(sp) +; ZHINX32-NEXT: lw t3, 144(sp) +; ZHINX32-NEXT: lw t4, 148(sp) +; ZHINX32-NEXT: lw t5, 152(sp) +; ZHINX32-NEXT: lw t6, 156(sp) ; ZHINX32-NEXT: sw s0, 12(sp) -; ZHINX32-NEXT: sw t4, 8(sp) -; ZHINX32-NEXT: sw t5, 4(sp) -; ZHINX32-NEXT: sw t6, 0(sp) -; ZHINX32-NEXT: lw t3, 104(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw t4, 100(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw t5, 96(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw t6, 92(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: sw t2, 8(sp) +; ZHINX32-NEXT: sw t1, 4(sp) +; ZHINX32-NEXT: sw t0, 0(sp) ; ZHINX32-NEXT: call callee_float_32 -; ZHINX32-NEXT: lw ra, 156(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw s0, 152(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw s1, 148(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw s2, 144(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw s3, 140(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw s4, 136(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw s5, 132(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw s6, 128(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw s7, 124(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw s8, 120(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw s9, 116(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw s10, 112(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: lw s11, 108(sp) # 4-byte Folded Reload -; ZHINX32-NEXT: addi sp, sp, 160 +; ZHINX32-NEXT: lw ra, 140(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw s0, 136(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw s1, 132(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw s2, 128(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw s3, 124(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw s4, 120(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw s5, 116(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw s6, 
112(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw s7, 108(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw s8, 104(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw s9, 100(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw s10, 96(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: lw s11, 92(sp) # 4-byte Folded Reload +; ZHINX32-NEXT: addi sp, sp, 144 ; ZHINX32-NEXT: ret ; ; ZHINX64-LABEL: caller_float_32: @@ -931,13 +923,13 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZHINX64-NEXT: sd s9, 136(sp) # 8-byte Folded Spill ; ZHINX64-NEXT: sd s10, 128(sp) # 8-byte Folded Spill ; ZHINX64-NEXT: sd s11, 120(sp) # 8-byte Folded Spill -; ZHINX64-NEXT: lw t0, 224(sp) +; ZHINX64-NEXT: lw t0, 248(sp) ; ZHINX64-NEXT: sd t0, 112(sp) # 8-byte Folded Spill -; ZHINX64-NEXT: lw t0, 232(sp) -; ZHINX64-NEXT: sd t0, 104(sp) # 8-byte Folded Spill ; ZHINX64-NEXT: lw t0, 240(sp) +; ZHINX64-NEXT: sd t0, 104(sp) # 8-byte Folded Spill +; ZHINX64-NEXT: lw t0, 232(sp) ; ZHINX64-NEXT: sd t0, 96(sp) # 8-byte Folded Spill -; ZHINX64-NEXT: lw t0, 248(sp) +; ZHINX64-NEXT: lw t0, 224(sp) ; ZHINX64-NEXT: sd t0, 88(sp) # 8-byte Folded Spill ; ZHINX64-NEXT: lw t6, 256(sp) ; ZHINX64-NEXT: lw t5, 264(sp) @@ -979,10 +971,10 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZHINX64-NEXT: sw t4, 8(sp) ; ZHINX64-NEXT: sw t5, 4(sp) ; ZHINX64-NEXT: sw t6, 0(sp) -; ZHINX64-NEXT: ld t3, 112(sp) # 8-byte Folded Reload -; ZHINX64-NEXT: ld t4, 104(sp) # 8-byte Folded Reload -; ZHINX64-NEXT: ld t5, 96(sp) # 8-byte Folded Reload -; ZHINX64-NEXT: ld t6, 88(sp) # 8-byte Folded Reload +; ZHINX64-NEXT: ld t3, 88(sp) # 8-byte Folded Reload +; ZHINX64-NEXT: ld t4, 96(sp) # 8-byte Folded Reload +; ZHINX64-NEXT: ld t5, 104(sp) # 8-byte Folded Reload +; ZHINX64-NEXT: ld t6, 112(sp) # 8-byte Folded Reload ; ZHINX64-NEXT: call callee_float_32 ; ZHINX64-NEXT: ld ra, 216(sp) # 8-byte Folded Reload ; ZHINX64-NEXT: ld s0, 208(sp) # 8-byte Folded Reload @@ -1002,32 +994,28 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ; ZFINX32-LABEL: caller_float_32: ; ZFINX32: # %bb.0: -; ZFINX32-NEXT: addi sp, sp, -160 -; ZFINX32-NEXT: sw ra, 156(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s0, 152(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s1, 148(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s2, 144(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s3, 140(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s4, 136(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s5, 132(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s6, 128(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s7, 124(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s8, 120(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s9, 116(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s10, 112(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: sw s11, 108(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: addi sp, sp, -144 +; ZFINX32-NEXT: sw ra, 140(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s0, 136(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s1, 132(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s2, 128(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s3, 124(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s4, 120(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s5, 116(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s6, 112(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s7, 108(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s8, 104(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s9, 100(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s10, 96(sp) # 4-byte Folded Spill +; ZFINX32-NEXT: sw s11, 92(sp) # 4-byte Folded Spill ; 
ZFINX32-NEXT: lw t0, 160(sp) -; ZFINX32-NEXT: sw t0, 104(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: lw t0, 164(sp) -; ZFINX32-NEXT: sw t0, 100(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: lw t0, 168(sp) -; ZFINX32-NEXT: sw t0, 96(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: lw t0, 172(sp) -; ZFINX32-NEXT: sw t0, 92(sp) # 4-byte Folded Spill -; ZFINX32-NEXT: lw t6, 176(sp) -; ZFINX32-NEXT: lw t5, 180(sp) -; ZFINX32-NEXT: lw t4, 184(sp) -; ZFINX32-NEXT: lw s0, 188(sp) +; ZFINX32-NEXT: lw t1, 164(sp) +; ZFINX32-NEXT: lw t2, 168(sp) +; ZFINX32-NEXT: lw s0, 172(sp) +; ZFINX32-NEXT: lw t3, 176(sp) +; ZFINX32-NEXT: lw t4, 180(sp) +; ZFINX32-NEXT: lw t5, 184(sp) +; ZFINX32-NEXT: lw t6, 188(sp) ; ZFINX32-NEXT: lw s1, 192(sp) ; ZFINX32-NEXT: lw s2, 196(sp) ; ZFINX32-NEXT: lw s3, 200(sp) @@ -1040,49 +1028,45 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZFINX32-NEXT: lw s10, 228(sp) ; ZFINX32-NEXT: lw s11, 232(sp) ; ZFINX32-NEXT: lw ra, 236(sp) -; ZFINX32-NEXT: lw t3, 240(sp) -; ZFINX32-NEXT: lw t2, 244(sp) -; ZFINX32-NEXT: lw t1, 248(sp) -; ZFINX32-NEXT: lw t0, 252(sp) -; ZFINX32-NEXT: sw t0, 76(sp) -; ZFINX32-NEXT: sw t1, 72(sp) -; ZFINX32-NEXT: sw t2, 68(sp) -; ZFINX32-NEXT: sw t3, 64(sp) -; ZFINX32-NEXT: sw ra, 60(sp) -; ZFINX32-NEXT: sw s11, 56(sp) -; ZFINX32-NEXT: sw s10, 52(sp) -; ZFINX32-NEXT: sw s9, 48(sp) -; ZFINX32-NEXT: sw s8, 44(sp) -; ZFINX32-NEXT: sw s7, 40(sp) -; ZFINX32-NEXT: sw s6, 36(sp) -; ZFINX32-NEXT: sw s5, 32(sp) -; ZFINX32-NEXT: sw s4, 28(sp) -; ZFINX32-NEXT: sw s3, 24(sp) -; ZFINX32-NEXT: sw s2, 20(sp) -; ZFINX32-NEXT: sw s1, 16(sp) +; ZFINX32-NEXT: sw ra, 76(sp) +; ZFINX32-NEXT: sw s11, 72(sp) +; ZFINX32-NEXT: sw s10, 68(sp) +; ZFINX32-NEXT: sw s9, 64(sp) +; ZFINX32-NEXT: sw s8, 60(sp) +; ZFINX32-NEXT: sw s7, 56(sp) +; ZFINX32-NEXT: sw s6, 52(sp) +; ZFINX32-NEXT: sw s5, 48(sp) +; ZFINX32-NEXT: sw s4, 44(sp) +; ZFINX32-NEXT: sw s3, 40(sp) +; ZFINX32-NEXT: sw s2, 36(sp) +; ZFINX32-NEXT: sw s1, 32(sp) +; ZFINX32-NEXT: sw t6, 28(sp) +; ZFINX32-NEXT: sw t5, 24(sp) +; ZFINX32-NEXT: sw t4, 20(sp) +; ZFINX32-NEXT: sw t3, 16(sp) +; ZFINX32-NEXT: lw t3, 144(sp) +; ZFINX32-NEXT: lw t4, 148(sp) +; ZFINX32-NEXT: lw t5, 152(sp) +; ZFINX32-NEXT: lw t6, 156(sp) ; ZFINX32-NEXT: sw s0, 12(sp) -; ZFINX32-NEXT: sw t4, 8(sp) -; ZFINX32-NEXT: sw t5, 4(sp) -; ZFINX32-NEXT: sw t6, 0(sp) -; ZFINX32-NEXT: lw t3, 104(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw t4, 100(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw t5, 96(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw t6, 92(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: sw t2, 8(sp) +; ZFINX32-NEXT: sw t1, 4(sp) +; ZFINX32-NEXT: sw t0, 0(sp) ; ZFINX32-NEXT: call callee_float_32 -; ZFINX32-NEXT: lw ra, 156(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s0, 152(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s1, 148(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s2, 144(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s3, 140(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s4, 136(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s5, 132(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s6, 128(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s7, 124(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s8, 120(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s9, 116(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s10, 112(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: lw s11, 108(sp) # 4-byte Folded Reload -; ZFINX32-NEXT: addi sp, sp, 160 +; ZFINX32-NEXT: lw ra, 140(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s0, 136(sp) # 4-byte Folded Reload +; 
ZFINX32-NEXT: lw s1, 132(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s2, 128(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s3, 124(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s4, 120(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s5, 116(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s6, 112(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s7, 108(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s8, 104(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s9, 100(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s10, 96(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: lw s11, 92(sp) # 4-byte Folded Reload +; ZFINX32-NEXT: addi sp, sp, 144 ; ZFINX32-NEXT: ret ; ; ZFINX64-LABEL: caller_float_32: @@ -1101,13 +1085,13 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZFINX64-NEXT: sd s9, 136(sp) # 8-byte Folded Spill ; ZFINX64-NEXT: sd s10, 128(sp) # 8-byte Folded Spill ; ZFINX64-NEXT: sd s11, 120(sp) # 8-byte Folded Spill -; ZFINX64-NEXT: lw t0, 224(sp) +; ZFINX64-NEXT: lw t0, 248(sp) ; ZFINX64-NEXT: sd t0, 112(sp) # 8-byte Folded Spill -; ZFINX64-NEXT: lw t0, 232(sp) -; ZFINX64-NEXT: sd t0, 104(sp) # 8-byte Folded Spill ; ZFINX64-NEXT: lw t0, 240(sp) +; ZFINX64-NEXT: sd t0, 104(sp) # 8-byte Folded Spill +; ZFINX64-NEXT: lw t0, 232(sp) ; ZFINX64-NEXT: sd t0, 96(sp) # 8-byte Folded Spill -; ZFINX64-NEXT: lw t0, 248(sp) +; ZFINX64-NEXT: lw t0, 224(sp) ; ZFINX64-NEXT: sd t0, 88(sp) # 8-byte Folded Spill ; ZFINX64-NEXT: lw t6, 256(sp) ; ZFINX64-NEXT: lw t5, 264(sp) @@ -1149,10 +1133,10 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZFINX64-NEXT: sw t4, 8(sp) ; ZFINX64-NEXT: sw t5, 4(sp) ; ZFINX64-NEXT: sw t6, 0(sp) -; ZFINX64-NEXT: ld t3, 112(sp) # 8-byte Folded Reload -; ZFINX64-NEXT: ld t4, 104(sp) # 8-byte Folded Reload -; ZFINX64-NEXT: ld t5, 96(sp) # 8-byte Folded Reload -; ZFINX64-NEXT: ld t6, 88(sp) # 8-byte Folded Reload +; ZFINX64-NEXT: ld t3, 88(sp) # 8-byte Folded Reload +; ZFINX64-NEXT: ld t4, 96(sp) # 8-byte Folded Reload +; ZFINX64-NEXT: ld t5, 104(sp) # 8-byte Folded Reload +; ZFINX64-NEXT: ld t6, 112(sp) # 8-byte Folded Reload ; ZFINX64-NEXT: call callee_float_32 ; ZFINX64-NEXT: ld ra, 216(sp) # 8-byte Folded Reload ; ZFINX64-NEXT: ld s0, 208(sp) # 8-byte Folded Reload @@ -1172,32 +1156,28 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ; ZDINX32-LABEL: caller_float_32: ; ZDINX32: # %bb.0: -; ZDINX32-NEXT: addi sp, sp, -160 -; ZDINX32-NEXT: sw ra, 156(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s0, 152(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s1, 148(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s2, 144(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s3, 140(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s4, 136(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s5, 132(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s6, 128(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s7, 124(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s8, 120(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s9, 116(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s10, 112(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: sw s11, 108(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: addi sp, sp, -144 +; ZDINX32-NEXT: sw ra, 140(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s0, 136(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s1, 132(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s2, 128(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s3, 124(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s4, 120(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s5, 116(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s6, 
112(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s7, 108(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s8, 104(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s9, 100(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s10, 96(sp) # 4-byte Folded Spill +; ZDINX32-NEXT: sw s11, 92(sp) # 4-byte Folded Spill ; ZDINX32-NEXT: lw t0, 160(sp) -; ZDINX32-NEXT: sw t0, 104(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: lw t0, 164(sp) -; ZDINX32-NEXT: sw t0, 100(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: lw t0, 168(sp) -; ZDINX32-NEXT: sw t0, 96(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: lw t0, 172(sp) -; ZDINX32-NEXT: sw t0, 92(sp) # 4-byte Folded Spill -; ZDINX32-NEXT: lw t6, 176(sp) -; ZDINX32-NEXT: lw t5, 180(sp) -; ZDINX32-NEXT: lw t4, 184(sp) -; ZDINX32-NEXT: lw s0, 188(sp) +; ZDINX32-NEXT: lw t1, 164(sp) +; ZDINX32-NEXT: lw t2, 168(sp) +; ZDINX32-NEXT: lw s0, 172(sp) +; ZDINX32-NEXT: lw t3, 176(sp) +; ZDINX32-NEXT: lw t4, 180(sp) +; ZDINX32-NEXT: lw t5, 184(sp) +; ZDINX32-NEXT: lw t6, 188(sp) ; ZDINX32-NEXT: lw s1, 192(sp) ; ZDINX32-NEXT: lw s2, 196(sp) ; ZDINX32-NEXT: lw s3, 200(sp) @@ -1210,49 +1190,45 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZDINX32-NEXT: lw s10, 228(sp) ; ZDINX32-NEXT: lw s11, 232(sp) ; ZDINX32-NEXT: lw ra, 236(sp) -; ZDINX32-NEXT: lw t3, 240(sp) -; ZDINX32-NEXT: lw t2, 244(sp) -; ZDINX32-NEXT: lw t1, 248(sp) -; ZDINX32-NEXT: lw t0, 252(sp) -; ZDINX32-NEXT: sw t0, 76(sp) -; ZDINX32-NEXT: sw t1, 72(sp) -; ZDINX32-NEXT: sw t2, 68(sp) -; ZDINX32-NEXT: sw t3, 64(sp) -; ZDINX32-NEXT: sw ra, 60(sp) -; ZDINX32-NEXT: sw s11, 56(sp) -; ZDINX32-NEXT: sw s10, 52(sp) -; ZDINX32-NEXT: sw s9, 48(sp) -; ZDINX32-NEXT: sw s8, 44(sp) -; ZDINX32-NEXT: sw s7, 40(sp) -; ZDINX32-NEXT: sw s6, 36(sp) -; ZDINX32-NEXT: sw s5, 32(sp) -; ZDINX32-NEXT: sw s4, 28(sp) -; ZDINX32-NEXT: sw s3, 24(sp) -; ZDINX32-NEXT: sw s2, 20(sp) -; ZDINX32-NEXT: sw s1, 16(sp) +; ZDINX32-NEXT: sw ra, 76(sp) +; ZDINX32-NEXT: sw s11, 72(sp) +; ZDINX32-NEXT: sw s10, 68(sp) +; ZDINX32-NEXT: sw s9, 64(sp) +; ZDINX32-NEXT: sw s8, 60(sp) +; ZDINX32-NEXT: sw s7, 56(sp) +; ZDINX32-NEXT: sw s6, 52(sp) +; ZDINX32-NEXT: sw s5, 48(sp) +; ZDINX32-NEXT: sw s4, 44(sp) +; ZDINX32-NEXT: sw s3, 40(sp) +; ZDINX32-NEXT: sw s2, 36(sp) +; ZDINX32-NEXT: sw s1, 32(sp) +; ZDINX32-NEXT: sw t6, 28(sp) +; ZDINX32-NEXT: sw t5, 24(sp) +; ZDINX32-NEXT: sw t4, 20(sp) +; ZDINX32-NEXT: sw t3, 16(sp) +; ZDINX32-NEXT: lw t3, 144(sp) +; ZDINX32-NEXT: lw t4, 148(sp) +; ZDINX32-NEXT: lw t5, 152(sp) +; ZDINX32-NEXT: lw t6, 156(sp) ; ZDINX32-NEXT: sw s0, 12(sp) -; ZDINX32-NEXT: sw t4, 8(sp) -; ZDINX32-NEXT: sw t5, 4(sp) -; ZDINX32-NEXT: sw t6, 0(sp) -; ZDINX32-NEXT: lw t3, 104(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw t4, 100(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw t5, 96(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw t6, 92(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: sw t2, 8(sp) +; ZDINX32-NEXT: sw t1, 4(sp) +; ZDINX32-NEXT: sw t0, 0(sp) ; ZDINX32-NEXT: call callee_float_32 -; ZDINX32-NEXT: lw ra, 156(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s0, 152(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s1, 148(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s2, 144(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s3, 140(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s4, 136(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s5, 132(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s6, 128(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s7, 124(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s8, 120(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s9, 
116(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s10, 112(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: lw s11, 108(sp) # 4-byte Folded Reload -; ZDINX32-NEXT: addi sp, sp, 160 +; ZDINX32-NEXT: lw ra, 140(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s0, 136(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s1, 132(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s2, 128(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s3, 124(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s4, 120(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s5, 116(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s6, 112(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s7, 108(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s8, 104(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s9, 100(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s10, 96(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: lw s11, 92(sp) # 4-byte Folded Reload +; ZDINX32-NEXT: addi sp, sp, 144 ; ZDINX32-NEXT: ret ; ; ZDINX64-LABEL: caller_float_32: @@ -1271,13 +1247,13 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZDINX64-NEXT: sd s9, 136(sp) # 8-byte Folded Spill ; ZDINX64-NEXT: sd s10, 128(sp) # 8-byte Folded Spill ; ZDINX64-NEXT: sd s11, 120(sp) # 8-byte Folded Spill -; ZDINX64-NEXT: lw t0, 224(sp) +; ZDINX64-NEXT: lw t0, 248(sp) ; ZDINX64-NEXT: sd t0, 112(sp) # 8-byte Folded Spill -; ZDINX64-NEXT: lw t0, 232(sp) -; ZDINX64-NEXT: sd t0, 104(sp) # 8-byte Folded Spill ; ZDINX64-NEXT: lw t0, 240(sp) +; ZDINX64-NEXT: sd t0, 104(sp) # 8-byte Folded Spill +; ZDINX64-NEXT: lw t0, 232(sp) ; ZDINX64-NEXT: sd t0, 96(sp) # 8-byte Folded Spill -; ZDINX64-NEXT: lw t0, 248(sp) +; ZDINX64-NEXT: lw t0, 224(sp) ; ZDINX64-NEXT: sd t0, 88(sp) # 8-byte Folded Spill ; ZDINX64-NEXT: lw t6, 256(sp) ; ZDINX64-NEXT: lw t5, 264(sp) @@ -1319,10 +1295,10 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZDINX64-NEXT: sw t4, 8(sp) ; ZDINX64-NEXT: sw t5, 4(sp) ; ZDINX64-NEXT: sw t6, 0(sp) -; ZDINX64-NEXT: ld t3, 112(sp) # 8-byte Folded Reload -; ZDINX64-NEXT: ld t4, 104(sp) # 8-byte Folded Reload -; ZDINX64-NEXT: ld t5, 96(sp) # 8-byte Folded Reload -; ZDINX64-NEXT: ld t6, 88(sp) # 8-byte Folded Reload +; ZDINX64-NEXT: ld t3, 88(sp) # 8-byte Folded Reload +; ZDINX64-NEXT: ld t4, 96(sp) # 8-byte Folded Reload +; ZDINX64-NEXT: ld t5, 104(sp) # 8-byte Folded Reload +; ZDINX64-NEXT: ld t6, 112(sp) # 8-byte Folded Reload ; ZDINX64-NEXT: call callee_float_32 ; ZDINX64-NEXT: ld ra, 216(sp) # 8-byte Folded Reload ; ZDINX64-NEXT: ld s0, 208(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/renamable-copy.mir b/llvm/test/CodeGen/RISCV/renamable-copy.mir new file mode 100644 index 0000000000000..06f17f4edbccf --- /dev/null +++ b/llvm/test/CodeGen/RISCV/renamable-copy.mir @@ -0,0 +1,31 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -o - %s -mtriple=riscv32 -simplify-mir \ +# RUN: -run-pass=postrapseudos | FileCheck --check-prefix=RV32 %s +# RUN: llc -o - %s -mtriple=riscv64 -simplify-mir \ +# RUN: -run-pass=postrapseudos | FileCheck --check-prefix=RV64 %s + +--- | + define void @foo() { + entry: + ret void + } +... 
+--- +name: foo +body: | + bb.0.entry: + liveins: $x11 + ; RV32-LABEL: name: foo + ; RV32: liveins: $x11 + ; RV32-NEXT: {{ $}} + ; RV32-NEXT: $x10 = ADDI renamable $x11, 0 + ; RV32-NEXT: PseudoRET implicit $x10 + ; + ; RV64-LABEL: name: foo + ; RV64: liveins: $x11 + ; RV64-NEXT: {{ $}} + ; RV64-NEXT: $x10 = ADDI renamable $x11, 0 + ; RV64-NEXT: PseudoRET implicit $x10 + renamable $x10 = COPY renamable $x11 + PseudoRET implicit $x10 +... diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat-bf16.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat-bf16.ll new file mode 100644 index 0000000000000..b1250f4804549 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat-bf16.ll @@ -0,0 +1,111 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfbfmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZFBFMIN-ZVFBFMIN +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZVFBFMIN +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfbfmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZFBFMIN-ZVFBFMIN +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZVFBFMIN + +define <8 x bfloat> @splat_v8bf16(ptr %x, bfloat %y) { +; ZFBFMIN-ZVFBFMIN-LABEL: splat_v8bf16: +; ZFBFMIN-ZVFBFMIN: # %bb.0: +; ZFBFMIN-ZVFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; ZFBFMIN-ZVFBFMIN-NEXT: vfmv.v.f v10, fa5 +; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZFBFMIN-ZVFBFMIN-NEXT: vfncvtbf16.f.f.w v8, v10 +; ZFBFMIN-ZVFBFMIN-NEXT: ret +; +; ZVFBFMIN-LABEL: splat_v8bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: fmv.x.w a0, fa0 +; ZVFBFMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFBFMIN-NEXT: vmv.v.x v8, a0 +; ZVFBFMIN-NEXT: ret + %a = insertelement <8 x bfloat> poison, bfloat %y, i32 0 + %b = shufflevector <8 x bfloat> %a, <8 x bfloat> poison, <8 x i32> zeroinitializer + ret <8 x bfloat> %b +} + +define <16 x bfloat> @splat_16bf16(ptr %x, bfloat %y) { +; ZFBFMIN-ZVFBFMIN-LABEL: splat_16bf16: +; ZFBFMIN-ZVFBFMIN: # %bb.0: +; ZFBFMIN-ZVFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; ZFBFMIN-ZVFBFMIN-NEXT: vfmv.v.f v12, fa5 +; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZFBFMIN-ZVFBFMIN-NEXT: vfncvtbf16.f.f.w v8, v12 +; ZFBFMIN-ZVFBFMIN-NEXT: ret +; +; ZVFBFMIN-LABEL: splat_16bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: fmv.x.w a0, fa0 +; ZVFBFMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFBFMIN-NEXT: vmv.v.x v8, a0 +; ZVFBFMIN-NEXT: ret + %a = insertelement <16 x bfloat> poison, bfloat %y, i32 0 + %b = shufflevector <16 x bfloat> %a, <16 x bfloat> poison, <16 x i32> zeroinitializer + ret <16 x bfloat> %b +} + +define <8 x bfloat> @splat_zero_v8bf16(ptr %x) { +; ZFBFMIN-ZVFBFMIN-LABEL: splat_zero_v8bf16: +; ZFBFMIN-ZVFBFMIN: # %bb.0: +; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZFBFMIN-ZVFBFMIN-NEXT: vmv.v.i v8, 0 +; ZFBFMIN-ZVFBFMIN-NEXT: ret +; +; ZVFBFMIN-LABEL: splat_zero_v8bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFBFMIN-NEXT: vmv.v.i v8, 0 +; ZVFBFMIN-NEXT: ret + ret <8 x bfloat> splat (bfloat 0.0) +} + +define <16 x bfloat> @splat_zero_16bf16(ptr %x) { +; ZFBFMIN-ZVFBFMIN-LABEL: splat_zero_16bf16: +; 
ZFBFMIN-ZVFBFMIN: # %bb.0: +; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZFBFMIN-ZVFBFMIN-NEXT: vmv.v.i v8, 0 +; ZFBFMIN-ZVFBFMIN-NEXT: ret +; +; ZVFBFMIN-LABEL: splat_zero_16bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFBFMIN-NEXT: vmv.v.i v8, 0 +; ZVFBFMIN-NEXT: ret + ret <16 x bfloat> splat (bfloat 0.0) +} + +define <8 x bfloat> @splat_negzero_v8bf16(ptr %x) { +; ZFBFMIN-ZVFBFMIN-LABEL: splat_negzero_v8bf16: +; ZFBFMIN-ZVFBFMIN: # %bb.0: +; ZFBFMIN-ZVFBFMIN-NEXT: lui a0, 1048568 +; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZFBFMIN-ZVFBFMIN-NEXT: vmv.v.x v8, a0 +; ZFBFMIN-ZVFBFMIN-NEXT: ret +; +; ZVFBFMIN-LABEL: splat_negzero_v8bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: lui a0, 1048568 +; ZVFBFMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFBFMIN-NEXT: vmv.v.x v8, a0 +; ZVFBFMIN-NEXT: ret + ret <8 x bfloat> splat (bfloat -0.0) +} + +define <16 x bfloat> @splat_negzero_16bf16(ptr %x) { +; ZFBFMIN-ZVFBFMIN-LABEL: splat_negzero_16bf16: +; ZFBFMIN-ZVFBFMIN: # %bb.0: +; ZFBFMIN-ZVFBFMIN-NEXT: lui a0, 1048568 +; ZFBFMIN-ZVFBFMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZFBFMIN-ZVFBFMIN-NEXT: vmv.v.x v8, a0 +; ZFBFMIN-ZVFBFMIN-NEXT: ret +; +; ZVFBFMIN-LABEL: splat_negzero_16bf16: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: lui a0, 1048568 +; ZVFBFMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFBFMIN-NEXT: vmv.v.x v8, a0 +; ZVFBFMIN-NEXT: ret + ret <16 x bfloat> splat (bfloat -0.0) +} diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-masked-vops.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-masked-vops.ll index d26fd0ca26c72..3a439cdb996fa 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-masked-vops.ll +++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-masked-vops.ll @@ -159,9 +159,8 @@ define @vmerge_larger_vl_same_passthru( %pa define @vmerge_smaller_vl_different_passthru( %pt1, %pt2, %x, %y, %m) { ; CHECK-LABEL: vmerge_smaller_vl_different_passthru: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 3, e32, m1, tu, mu +; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, mu ; CHECK-NEXT: vadd.vv v8, v10, v11, v0.t -; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll index 39055dc5adfcf..6700920cebff0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll +++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll @@ -1072,9 +1072,8 @@ define @vmerge_larger_vl_same_passthru( %pa define @vmerge_smaller_vl_different_passthru( %pt1, %pt2, %x, %y, %m) { ; CHECK-LABEL: vmerge_smaller_vl_different_passthru: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; CHECK-NEXT: vadd.vv v8, v10, v11 ; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; CHECK-NEXT: vadd.vv v8, v10, v11 ; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/SPIRV/debug-info/basic-global-di.ll b/llvm/test/CodeGen/SPIRV/debug-info/basic-global-di.ll index 336b7db324c3d..139a8e11182bc 100644 --- a/llvm/test/CodeGen/SPIRV/debug-info/basic-global-di.ll +++ b/llvm/test/CodeGen/SPIRV/debug-info/basic-global-di.ll @@ -8,12 +8,12 @@ ; CHECK-MIR-DAG: [[dwarf_version:%[0-9]+\:iid\(s32\)]] = OpConstantI [[type_i64]], 5 ; CHECK-MIR-DAG: [[source_language:%[0-9]+\:iid\(s32\)]] = OpConstantI [[type_i64]], 3 ; CHECK-MIR-DAG: 
[[debug_info_version:%[0-9]+\:iid\(s32\)]] = OpConstantI [[type_i64]], 21 -; CHECK-MIR-DAG: [[filename_str:%[0-9]+\:id\(s32\)]] = OpString 1094795567, 1094795585, 792805697, 1111638594, 1111638594, 1128481583, 1128481603, 1697596227, 1886216568, 1663985004, 0 +; CHECK-MIR-DAG: [[filename_str:%[0-9]+\:id\(s32\)]] = OpString 1094795567, 1094795585, 792805697, 1111638594, 1111638594, 1128481583, 1128481603, {{1697596227|1700545347}}, 1886216568, 1663985004, 0 ; CHECK-MIR-DAG: [[debug_source:%[0-9]+\:id\(s32\)]] = OpExtInst [[type_void]], 3, 35, [[filename_str]] ; CHECK-MIR-DAG: [[debug_compilation_unit:%[0-9]+\:id\(s32\)]] = OpExtInst [[type_void]], 3, 1, [[source_language]], [[dwarf_version]], [[debug_source]], [[debug_info_version]] ; CHECK-SPIRV: [[ext_inst_non_semantic:%[0-9]+]] = OpExtInstImport "NonSemantic.Shader.DebugInfo.100" -; CHECK-SPIRV: [[filename_str:%[0-9]+]] = OpString "/AAAAAAAAAA/BBBBBBBB/CCCCCCCCC/example.c" +; CHECK-SPIRV: [[filename_str:%[0-9]+]] = OpString "/AAAAAAAAAA/BBBBBBBB/CCCCCCCCC{{[/\\]}}example.c" ; CHECK-SPIRV-DAG: [[type_void:%[0-9]+]] = OpTypeVoid ; CHECK-SPIRV-DAG: [[type_i32:%[0-9]+]] = OpTypeInt 32 0 ; CHECK-SPIRV-DAG: [[dwarf_version:%[0-9]+]] = OpConstant [[type_i32]] 5 @@ -23,7 +23,7 @@ ; CHECK-SPIRV: [[debug_compiation_unit:%[0-9]+]] = OpExtInst [[type_void]] [[ext_inst_non_semantic]] DebugCompilationUnit [[source_language]] [[dwarf_version]] [[debug_source]] [[debug_info_version]] ; CHECK-OPTION-NOT: OpExtInstImport "NonSemantic.Shader.DebugInfo.100" -; CHECK-OPTION-NOT: OpString "/AAAAAAAAAA/BBBBBBBB/CCCCCCCCC/example.c" +; CHECK-OPTION-NOT: OpString "/AAAAAAAAAA/BBBBBBBB/CCCCCCCCC{{[/\\]}}example.c" define spir_func void @foo() { entry: diff --git a/llvm/test/CodeGen/SPIRV/opencl/vload_halfn.ll b/llvm/test/CodeGen/SPIRV/opencl/vload_halfn.ll new file mode 100644 index 0000000000000..abfae74afb659 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/opencl/vload_halfn.ll @@ -0,0 +1,15 @@ +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK: %[[#IMPORT:]] = OpExtInstImport "OpenCL.std" + +; CHECK: %[[#FLOAT:]] = OpTypeFloat 32 +; CHECK: %[[#V2FLOAT:]] = OpTypeVector %[[#FLOAT]] 2 + +define void @test(i64 %a, ptr addrspace(1) %b) { +; CHECK: %[[#]] = OpExtInst %[[#V2FLOAT:]] %[[#IMPORT]] vload_halfn %[[#]] %[[#]] 2 + %c = call spir_func <2 x float> @_Z11vload_half2mPU3AS1KDh(i64 %a, ptr addrspace(1) %b) + ret void +} + +declare <2 x float> @_Z11vload_half2mPU3AS1KDh(i64, ptr addrspace(1)) diff --git a/llvm/test/CodeGen/X86/code-align-loops.ll b/llvm/test/CodeGen/X86/code-align-loops.ll index 3823293d747c7..616478993a51d 100644 --- a/llvm/test/CodeGen/X86/code-align-loops.ll +++ b/llvm/test/CodeGen/X86/code-align-loops.ll @@ -96,6 +96,56 @@ for.body5: ; preds = %for.body, %for.body br i1 %exitcond16.not, label %for.cond.cleanup4, label %for.body5, !llvm.loop !2 } +; test3 is to check if .p2align can be correctly set on loops with multi latches. 
+; The test IR is generated from below simple C file: +; $ clang -O0 -S -emit-llvm loop.c +; $ cat loop.c +; int test3() { +; int i = 0; +; [[clang::code_align(32)]] +; while (i < 10) { +; if (i % 2) { +; continue; +; } +; i++; +; } +; } +; CHECK-LABEL: test3_multilatch: +; ALIGN: .p2align 4, 0x90 +; ALIGN-NEXT: .LBB2_1: # %while.cond +define dso_local i32 @test3_multilatch() #0 { +entry: + %retval = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 0, ptr %retval, align 4 + store i32 0, ptr %i, align 4 + br label %while.cond + +while.cond: ; preds = %if.end, %if.then, %entry + %0 = load i32, ptr %i, align 4 + %cmp = icmp slt i32 %0, 10 + br i1 %cmp, label %while.body, label %while.end + +while.body: ; preds = %while.cond + %1 = load i32, ptr %i, align 4 + %rem = srem i32 %1, 2 + %tobool = icmp ne i32 %rem, 0 + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %while.body + br label %while.cond, !llvm.loop !0 + +if.end: ; preds = %while.body + %2 = load i32, ptr %i, align 4 + %inc = add nsw i32 %2, 1 + store i32 %inc, ptr %i, align 4 + br label %while.cond, !llvm.loop !0 + +while.end: ; preds = %while.cond + %3 = load i32, ptr %retval, align 4 + ret i32 %3 +} + declare void @bar() declare void @var() diff --git a/llvm/test/CodeGen/Xtensa/load.ll b/llvm/test/CodeGen/Xtensa/load.ll new file mode 100644 index 0000000000000..2f730f56eb1f5 --- /dev/null +++ b/llvm/test/CodeGen/Xtensa/load.ll @@ -0,0 +1,12 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=xtensa < %s | FileCheck %s + +define signext i8 @test_load_i8(ptr %p){ +; CHECK-LABEL: test_load_i8: +; CHECK: l8ui a8, a2, 0 +; CHECK-NEXT: slli a8, a8, 24 +; CHECK-NEXT: srai a2, a8, 24 +; CHECK-NEXT: ret + %1 = load i8, ptr %p, align 1 + ret i8 %1 +} diff --git a/llvm/test/Instrumentation/RealtimeSanitizer/rtsan_nosanitize.ll b/llvm/test/Instrumentation/RealtimeSanitizer/rtsan_nosanitize.ll new file mode 100644 index 0000000000000..5a465df749da6 --- /dev/null +++ b/llvm/test/Instrumentation/RealtimeSanitizer/rtsan_nosanitize.ll @@ -0,0 +1,25 @@ +; RUN: opt < %s -passes=rtsan -S | FileCheck %s + +define void @nosanitized_function() #0 { + %1 = alloca ptr, align 8 + %2 = call ptr @malloc(i64 noundef 2) #3 + store ptr %2, ptr %1, align 8 + ret void +} + +declare ptr @malloc(i64 noundef) #1 + +define noundef i32 @main() #2 { + %1 = alloca i32, align 4 + store i32 0, ptr %1, align 4 + call void @nosanitized_function() #4 + ret i32 0 +} + +attributes #0 = { nosanitize_realtime } + +; CHECK-LABEL: @nosanitized_function() +; CHECK-NEXT: call{{.*}}@__rtsan_off + +; CHECK: call{{.*}}@__rtsan_on +; CHECK-NEXT: ret{{.*}}void diff --git a/llvm/test/Transforms/ArgumentPromotion/actual-arguments.ll b/llvm/test/Transforms/ArgumentPromotion/actual-arguments.ll new file mode 100644 index 0000000000000..63366ba998c7b --- /dev/null +++ b/llvm/test/Transforms/ArgumentPromotion/actual-arguments.ll @@ -0,0 +1,231 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes +; RUN: opt -S -passes=argpromotion < %s | FileCheck %s + +; In the following tests, the call to @callee may invalidate ptr %test_c and so +; prohibit removing loads of %test_c following the call, preventing Argument +; Promotion of %test_c in the general case. + +; This is called by @caller_ptr_args, from which we cannot prove anything about +; whether %test_c may alias %p and so we cannot promote %test_c. 
+; +define internal i32 @test_cannot_promote_1(ptr %p, ptr nocapture readonly %test_c) { +; CHECK-LABEL: define {{[^@]+}}@test_cannot_promote_1 +; CHECK-SAME: (ptr [[P:%.*]], ptr nocapture readonly [[TEST_C:%.*]]) { +; CHECK-NEXT: [[TEST_C_VAL:%.*]] = load i32, ptr [[TEST_C]], align 4 +; CHECK-NEXT: [[RES:%.*]] = call i32 @callee(ptr [[P]], i32 [[TEST_C_VAL]]) +; CHECK-NEXT: [[LTEST_C:%.*]] = load i32, ptr [[TEST_C]], align 4 +; CHECK-NEXT: [[SUM:%.*]] = add i32 [[LTEST_C]], [[RES]] +; CHECK-NEXT: ret i32 [[SUM]] +; + %res = call i32 @callee(ptr %p, ptr %test_c) + + %ltest_c = load i32, ptr %test_c + + %sum = add i32 %ltest_c, %res + + ret i32 %sum +} + +; This is called by @caller_aliased_args, from which we can see that %test_c may +; alias %p and so we cannot promote %test_c. +; +define internal i32 @test_cannot_promote_2(ptr %p, ptr nocapture readonly %test_c) { +; CHECK-LABEL: define {{[^@]+}}@test_cannot_promote_2 +; CHECK-SAME: (ptr [[P:%.*]], ptr nocapture readonly [[TEST_C:%.*]]) { +; CHECK-NEXT: [[TEST_C_VAL:%.*]] = load i32, ptr [[TEST_C]], align 4 +; CHECK-NEXT: [[RES:%.*]] = call i32 @callee(ptr [[P]], i32 [[TEST_C_VAL]]) +; CHECK-NEXT: [[LTEST_C:%.*]] = load i32, ptr [[TEST_C]], align 4 +; CHECK-NEXT: [[SUM:%.*]] = add i32 [[LTEST_C]], [[RES]] +; CHECK-NEXT: ret i32 [[SUM]] +; + %res = call i32 @callee(ptr %p, ptr %test_c) + + %ltest_c = load i32, ptr %test_c + + %sum = add i32 %ltest_c, %res + + ret i32 %sum +} + +; This is called by @caller_safe_args_1, but also from @caller_aliased_args, so +; we cannot promote %test_c. +; +define internal i32 @test_cannot_promote_3(ptr %p, ptr nocapture readonly %test_c) { +; CHECK-LABEL: define {{[^@]+}}@test_cannot_promote_3 +; CHECK-SAME: (ptr [[P:%.*]], ptr nocapture readonly [[TEST_C:%.*]]) { +; CHECK-NEXT: [[TEST_C_VAL:%.*]] = load i32, ptr [[TEST_C]], align 4 +; CHECK-NEXT: [[RES:%.*]] = call i32 @callee(ptr [[P]], i32 [[TEST_C_VAL]]) +; CHECK-NEXT: [[LTEST_C:%.*]] = load i32, ptr [[TEST_C]], align 4 +; CHECK-NEXT: [[SUM:%.*]] = add i32 [[LTEST_C]], [[RES]] +; CHECK-NEXT: ret i32 [[SUM]] +; + %res = call i32 @callee(ptr %p, ptr %test_c) + + %ltest_c = load i32, ptr %test_c + + %sum = add i32 %ltest_c, %res + + ret i32 %sum +} + +; FIXME: We should perform ArgPromotion here! +; +; This is called only by @caller_safe_args_1, from which we can prove that +; %test_c does not alias %p for any Call to the function, so we can promote it. +; +define internal i32 @test_can_promote_1(ptr %p, ptr nocapture readonly %test_c) { +; CHECK-LABEL: define {{[^@]+}}@test_can_promote_1 +; CHECK-SAME: (ptr [[P:%.*]], ptr nocapture readonly [[TEST_C:%.*]]) { +; CHECK-NEXT: [[TEST_C_VAL:%.*]] = load i32, ptr [[TEST_C]], align 4 +; CHECK-NEXT: [[RES:%.*]] = call i32 @callee(ptr [[P]], i32 [[TEST_C_VAL]]) +; CHECK-NEXT: [[LTEST_C:%.*]] = load i32, ptr [[TEST_C]], align 4 +; CHECK-NEXT: [[SUM:%.*]] = add i32 [[LTEST_C]], [[RES]] +; CHECK-NEXT: ret i32 [[SUM]] +; + %res = call i32 @callee(ptr %p, ptr %test_c) + + %ltest_c = load i32, ptr %test_c + + %sum = add i32 %ltest_c, %res + + ret i32 %sum +} + +; FIXME: We should perform ArgPromotion here! +; +; This is called by multiple callers (@caller_safe_args_1, @caller_safe_args_2), +; from which we can prove that %test_c does not alias %p for any Call to the +; function, so we can promote it. 
+; +define internal i32 @test_can_promote_2(ptr %p, ptr nocapture readonly %test_c) { +; CHECK-LABEL: define {{[^@]+}}@test_can_promote_2 +; CHECK-SAME: (ptr [[P:%.*]], ptr nocapture readonly [[TEST_C:%.*]]) { +; CHECK-NEXT: [[TEST_C_VAL:%.*]] = load i32, ptr [[TEST_C]], align 4 +; CHECK-NEXT: [[RES:%.*]] = call i32 @callee(ptr [[P]], i32 [[TEST_C_VAL]]) +; CHECK-NEXT: [[LTEST_C:%.*]] = load i32, ptr [[TEST_C]], align 4 +; CHECK-NEXT: [[SUM:%.*]] = add i32 [[LTEST_C]], [[RES]] +; CHECK-NEXT: ret i32 [[SUM]] +; + %res = call i32 @callee(ptr %p, ptr %test_c) + + %ltest_c = load i32, ptr %test_c + + %sum = add i32 %ltest_c, %res + + ret i32 %sum +} + +; Called by @test_XXX +define internal i32 @callee(ptr %p, ptr nocapture readonly %callee_c) { +; CHECK-LABEL: define {{[^@]+}}@callee +; CHECK-SAME: (ptr [[P:%.*]], i32 [[CALLEE_C_0_VAL:%.*]]) { +; CHECK-NEXT: [[A:%.*]] = load i32, ptr [[P]], align 4 +; CHECK-NEXT: [[SUM:%.*]] = add i32 [[A]], [[CALLEE_C_0_VAL]] +; CHECK-NEXT: store i32 [[SUM]], ptr [[P]], align 4 +; CHECK-NEXT: ret i32 [[SUM]] +; + %a = load i32, ptr %p + + %lcallee_c = load i32, ptr %callee_c + + %sum = add i32 %a, %lcallee_c + + store i32 %sum, ptr %p + + ret i32 %sum +} + +; Calls @test_cannot_promote_1 +define i32 @caller_ptr_args(i64 %n, ptr %p1, ptr %p2) { +; CHECK-LABEL: define {{[^@]+}}@caller_ptr_args +; CHECK-SAME: (i64 [[N:%.*]], ptr [[P1:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: call void @memset(ptr [[P1]], i64 0, i64 [[N]]) +; CHECK-NEXT: store i32 5, ptr [[P2]], align 4 +; CHECK-NEXT: [[RES:%.*]] = call i32 @test_cannot_promote_1(ptr [[P1]], ptr [[P2]]) +; CHECK-NEXT: ret i32 [[RES]] +; + call void @memset(ptr %p1, i64 0, i64 %n) + + store i32 5, ptr %p2 + + %res = call i32 @test_cannot_promote_1(ptr %p1, ptr %p2) + + ret i32 %res +} + +; Calls @test_cannot_promote_2 +; Calls @test_cannot_promote_3 +define i32 @caller_aliased_args() { +; CHECK-LABEL: define {{[^@]+}}@caller_aliased_args() { +; CHECK-NEXT: [[CALLER_C:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 5, ptr [[CALLER_C]], align 4 +; CHECK-NEXT: [[RES1:%.*]] = call i32 @test_cannot_promote_2(ptr [[CALLER_C]], ptr [[CALLER_C]]) +; CHECK-NEXT: [[RES2:%.*]] = call i32 @test_cannot_promote_3(ptr [[CALLER_C]], ptr [[CALLER_C]]) +; CHECK-NEXT: [[RES:%.*]] = add i32 [[RES1]], [[RES2]] +; CHECK-NEXT: ret i32 [[RES]] +; + %caller_c = alloca i32 + store i32 5, ptr %caller_c + + %res1 = call i32 @test_cannot_promote_2(ptr %caller_c, ptr %caller_c) + %res2 = call i32 @test_cannot_promote_3(ptr %caller_c, ptr %caller_c) + + %res = add i32 %res1, %res2 + + ret i32 %res +} + +; Calls @test_cannot_promote_3 +; Calls @test_can_promote_1 +; Calls @test_can_promote_2 +define i32 @caller_safe_args_1(i64 %n) { +; CHECK-LABEL: define {{[^@]+}}@caller_safe_args_1 +; CHECK-SAME: (i64 [[N:%.*]]) { +; CHECK-NEXT: [[P:%.*]] = alloca [5 x double], i64 [[N]], align 8 +; CHECK-NEXT: call void @memset(ptr [[P]], i64 0, i64 [[N]]) +; CHECK-NEXT: [[CALLER_C:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 5, ptr [[CALLER_C]], align 4 +; CHECK-NEXT: [[RES1:%.*]] = call i32 @test_cannot_promote_3(ptr [[P]], ptr [[CALLER_C]]) +; CHECK-NEXT: [[RES2:%.*]] = call i32 @test_can_promote_1(ptr [[P]], ptr [[CALLER_C]]) +; CHECK-NEXT: [[RES3:%.*]] = call i32 @test_can_promote_2(ptr [[P]], ptr [[CALLER_C]]) +; CHECK-NEXT: [[RES12:%.*]] = add i32 [[RES1]], [[RES2]] +; CHECK-NEXT: [[RES:%.*]] = add i32 [[RES12]], [[RES3]] +; CHECK-NEXT: ret i32 [[RES]] +; + %p = alloca [5 x double], i64 %n + call void @memset(ptr %p, i64 0, i64 %n) + + 
%caller_c = alloca i32 + store i32 5, ptr %caller_c + + %res1 = call i32 @test_cannot_promote_3(ptr %p, ptr %caller_c) + %res2 = call i32 @test_can_promote_1(ptr %p, ptr %caller_c) + %res3 = call i32 @test_can_promote_2(ptr %p, ptr %caller_c) + + %res12 = add i32 %res1, %res2 + %res = add i32 %res12, %res3 + + ret i32 %res +} + +; Calls @test_can_promote_2 +define i32 @caller_safe_args_2(i64 %n, ptr %p) { +; CHECK-LABEL: define {{[^@]+}}@caller_safe_args_2 +; CHECK-SAME: (i64 [[N:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: call void @memset(ptr [[P]], i64 0, i64 [[N]]) +; CHECK-NEXT: [[CALLER_C:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 5, ptr [[CALLER_C]], align 4 +; CHECK-NEXT: [[RES:%.*]] = call i32 @test_can_promote_2(ptr [[P]], ptr [[CALLER_C]]) +; CHECK-NEXT: ret i32 [[RES]] +; + call void @memset(ptr %p, i64 0, i64 %n) + + %caller_c = alloca i32 + store i32 5, ptr %caller_c + + %res = call i32 @test_can_promote_2(ptr %p, ptr %caller_c) + + ret i32 %res +} + +declare void @memset(ptr, i64, i64) diff --git a/llvm/test/Transforms/Coroutines/coro-pgo-setbranchweights.ll b/llvm/test/Transforms/Coroutines/coro-pgo-setbranchweights.ll new file mode 100644 index 0000000000000..4f5f936606ca3 --- /dev/null +++ b/llvm/test/Transforms/Coroutines/coro-pgo-setbranchweights.ll @@ -0,0 +1,42 @@ +; RUN: rm -rf %t && split-file %s %t + +; RUN: llvm-profdata merge %t/a.proftext -o %t/a.profdata +; RUN: opt < %t/a.ll --passes=pgo-instr-use -pgo-test-profile-file=%t/a.profdata + +;--- a.ll +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-redhat-linux-gnu" + +define void @_bar() presplitcoroutine personality ptr null { + %1 = call token @llvm.coro.save(ptr null) + %2 = call i8 @llvm.coro.suspend(token none, i1 false) + switch i8 %2, label %5 [ + i8 0, label %3 + i8 1, label %4 + ] + +3: ; preds = %0 + ret void + +4: ; preds = %0 + ret void + +5: ; preds = %0 + ret void +} + +declare token @llvm.coro.save(ptr) + +declare i8 @llvm.coro.suspend(token, i1) + +;--- a.proftext +# IR level Instrumentation Flag +:ir + +_bar +# Func Hash: +1063705160175073211 +# Num Counters: +2 +1 +0 diff --git a/llvm/test/Transforms/Inline/ML/lit.local.cfg b/llvm/test/Transforms/Inline/ML/lit.local.cfg deleted file mode 100644 index e8c7912650cb8..0000000000000 --- a/llvm/test/Transforms/Inline/ML/lit.local.cfg +++ /dev/null @@ -1,3 +0,0 @@ -import sys - -config.unsupported = sys.version_info.minor <= 8 diff --git a/llvm/test/Transforms/InstCombine/memchr-7.ll b/llvm/test/Transforms/InstCombine/memchr-7.ll index 0b364cce656d7..61f1093279f83 100644 --- a/llvm/test/Transforms/InstCombine/memchr-7.ll +++ b/llvm/test/Transforms/InstCombine/memchr-7.ll @@ -12,11 +12,12 @@ declare ptr @memchr(ptr, i32, i64) define zeroext i1 @strchr_to_memchr_n_equals_len(i32 %c) { ; CHECK-LABEL: @strchr_to_memchr_n_equals_len( -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[C:%.*]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[C]], -97 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 26 -; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP1]], [[TMP3]] -; CHECK-NEXT: ret i1 [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[C:%.*]] to i8 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = add i8 [[TMP1]], -97 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i8 [[TMP3]], 26 +; CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP2]], [[TMP4]] +; CHECK-NEXT: ret i1 [[TMP5]] ; %call = tail call ptr @strchr(ptr nonnull dereferenceable(27) @.str, i32 %c) %cmp = icmp ne ptr %call, 
null @@ -38,9 +39,10 @@ define zeroext i1 @memchr_n_equals_len(i32 %c) { define zeroext i1 @memchr_n_less_than_len(i32 %c) { ; CHECK-LABEL: @memchr_n_less_than_len( -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[C:%.*]], -97 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[TMP1]], 15 -; CHECK-NEXT: ret i1 [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[C:%.*]] to i8 +; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], -97 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i8 [[TMP2]], 15 +; CHECK-NEXT: ret i1 [[TMP3]] ; %call = tail call ptr @memchr(ptr @.str, i32 %c, i64 15) %cmp = icmp ne ptr %call, null @@ -50,11 +52,12 @@ define zeroext i1 @memchr_n_less_than_len(i32 %c) { define zeroext i1 @memchr_n_more_than_len(i32 %c) { ; CHECK-LABEL: @memchr_n_more_than_len( -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[C:%.*]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[C]], -97 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 26 -; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP1]], [[TMP3]] -; CHECK-NEXT: ret i1 [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[C:%.*]] to i8 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = add i8 [[TMP1]], -97 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i8 [[TMP3]], 26 +; CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP2]], [[TMP4]] +; CHECK-NEXT: ret i1 [[TMP5]] ; %call = tail call ptr @memchr(ptr @.str, i32 %c, i64 30) %cmp = icmp ne ptr %call, null @@ -114,12 +117,13 @@ define zeroext i1 @memchr_n_equals_len2_minsize(i32 %c) minsize { ; Positive test - 2 non-contiguous ranges define zeroext i1 @strchr_to_memchr_2_non_cont_ranges(i32 %c) { ; CHECK-LABEL: @strchr_to_memchr_2_non_cont_ranges( -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[C:%.*]], -97 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[TMP1]], 6 -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[C]], -109 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 3 -; CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP2]], [[TMP4]] -; CHECK-NEXT: ret i1 [[TMP5]] +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[C:%.*]] to i8 +; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], -97 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i8 [[TMP2]], 6 +; CHECK-NEXT: [[TMP4:%.*]] = add i8 [[TMP1]], -109 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i8 [[TMP4]], 3 +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]] +; CHECK-NEXT: ret i1 [[TMP6]] ; %call = tail call ptr @memchr(ptr @.str.2, i32 %c, i64 9) %cmp = icmp ne ptr %call, null @@ -129,12 +133,13 @@ define zeroext i1 @strchr_to_memchr_2_non_cont_ranges(i32 %c) { ; Positive test - 2 non-contiguous ranges with char duplication define zeroext i1 @strchr_to_memchr_2_non_cont_ranges_char_dup(i32 %c) { ; CHECK-LABEL: @strchr_to_memchr_2_non_cont_ranges_char_dup( -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[C:%.*]], -97 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[TMP1]], 3 -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[C]], -109 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP2]], [[TMP4]] -; CHECK-NEXT: ret i1 [[TMP5]] +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[C:%.*]] to i8 +; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], -97 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i8 [[TMP2]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = add i8 [[TMP1]], -109 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i8 [[TMP4]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]] +; CHECK-NEXT: ret i1 [[TMP6]] ; %call = tail call ptr @memchr(ptr @.str.4, i32 %c, i64 6) %cmp = icmp ne ptr %call, null diff --git a/llvm/test/Transforms/LoopVectorize/global_alias.ll b/llvm/test/Transforms/LoopVectorize/global_alias.ll index 
e0f12fb3b0279..336e462a4cf63 100644 --- a/llvm/test/Transforms/LoopVectorize/global_alias.ll +++ b/llvm/test/Transforms/LoopVectorize/global_alias.ll @@ -503,7 +503,7 @@ for.end: ; preds = %for.body ; return Foo.A[a]; ; } ; CHECK-LABEL: define i32 @mayAlias01( -; CHECK: add nsw <4 x i32> +; CHECK-NOT: add nsw <4 x i32> ; CHECK: ret define i32 @mayAlias01(i32 %a) nounwind { @@ -536,7 +536,7 @@ for.end: ; preds = %for.body ; return Foo.A[a]; ; } ; CHECK-LABEL: define i32 @mayAlias02( -; CHECK: add nsw <4 x i32> +; CHECK-NOT: add nsw <4 x i32> ; CHECK: ret define i32 @mayAlias02(i32 %a) nounwind { diff --git a/llvm/test/Transforms/SCCP/pointer-nonnull.ll b/llvm/test/Transforms/SCCP/pointer-nonnull.ll new file mode 100644 index 0000000000000..cd04c1c2d39d9 --- /dev/null +++ b/llvm/test/Transforms/SCCP/pointer-nonnull.ll @@ -0,0 +1,206 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=sccp < %s | FileCheck %s + +declare ptr @get() + +define i1 @test_no_attr(ptr %p) { +; CHECK-LABEL: define i1 @test_no_attr( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = icmp ne ptr [[P]], null +; CHECK-NEXT: ret i1 [[CMP]] +; + %cmp = icmp ne ptr %p, null + ret i1 %cmp +} + +define i1 @test_nonnull(ptr nonnull %p) { +; CHECK-LABEL: define i1 @test_nonnull( +; CHECK-SAME: ptr nonnull [[P:%.*]]) { +; CHECK-NEXT: ret i1 true +; + %cmp = icmp ne ptr %p, null + ret i1 %cmp +} + +define i1 @test_nonnull_eq(ptr nonnull %p) { +; CHECK-LABEL: define i1 @test_nonnull_eq( +; CHECK-SAME: ptr nonnull [[P:%.*]]) { +; CHECK-NEXT: ret i1 false +; + %cmp = icmp eq ptr %p, null + ret i1 %cmp +} + +define i1 @test_dereferenceable(ptr dereferenceable(4) %p) { +; CHECK-LABEL: define i1 @test_dereferenceable( +; CHECK-SAME: ptr dereferenceable(4) [[P:%.*]]) { +; CHECK-NEXT: ret i1 true +; + %cmp = icmp ne ptr %p, null + ret i1 %cmp +} + +define i1 @test_alloca() { +; CHECK-LABEL: define i1 @test_alloca() { +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: ret i1 true +; + %a = alloca i32 + %cmp = icmp ne ptr %a, null + ret i1 %cmp +} + +define i1 @test_alloca_addrspace() { +; CHECK-LABEL: define i1 @test_alloca_addrspace() { +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4, addrspace(1) +; CHECK-NEXT: [[CMP:%.*]] = icmp ne ptr addrspace(1) [[A]], null +; CHECK-NEXT: ret i1 [[CMP]] +; + %a = alloca i32, addrspace(1) + %cmp = icmp ne ptr addrspace(1) %a, null + ret i1 %cmp +} + +define i1 @test_alloca_null_pointer_valid() null_pointer_is_valid { +; CHECK-LABEL: define i1 @test_alloca_null_pointer_valid( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne ptr [[A]], null +; CHECK-NEXT: ret i1 [[CMP]] +; + %a = alloca i32 + %cmp = icmp ne ptr %a, null + ret i1 %cmp +} + +define i1 @test_load_nonnull(ptr %p) { +; CHECK-LABEL: define i1 @test_load_nonnull( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[P2:%.*]] = load ptr, ptr [[P]], align 8, !nonnull [[META0:![0-9]+]] +; CHECK-NEXT: ret i1 true +; + %p2 = load ptr, ptr %p, !nonnull !{} + %cmp = icmp ne ptr %p2, null + ret i1 %cmp +} + +define i1 @test_call_nonnull() { +; CHECK-LABEL: define i1 @test_call_nonnull() { +; CHECK-NEXT: [[P:%.*]] = call nonnull ptr @get() +; CHECK-NEXT: ret i1 true +; + %p = call nonnull ptr @get() + %cmp = icmp ne ptr %p, null + ret i1 %cmp +} + +define i1 @test_call_dereferenceable() { +; CHECK-LABEL: define i1 @test_call_dereferenceable() { +; CHECK-NEXT: [[P:%.*]] = call dereferenceable(4) 
ptr @get() +; CHECK-NEXT: ret i1 true +; + %p = call dereferenceable(4) ptr @get() + %cmp = icmp ne ptr %p, null + ret i1 %cmp +} + +define i1 @test_gep_no_flags(ptr nonnull %p, i64 %x) { +; CHECK-LABEL: define i1 @test_gep_no_flags( +; CHECK-SAME: ptr nonnull [[P:%.*]], i64 [[X:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[P]], i64 [[X]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ne ptr [[GEP]], null +; CHECK-NEXT: ret i1 [[CMP]] +; + %gep = getelementptr i8, ptr %p, i64 %x + %cmp = icmp ne ptr %gep, null + ret i1 %cmp +} + +define i1 @test_gep_nuw(ptr nonnull %p, i64 %x) { +; CHECK-LABEL: define i1 @test_gep_nuw( +; CHECK-SAME: ptr nonnull [[P:%.*]], i64 [[X:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr nuw i8, ptr [[P]], i64 [[X]] +; CHECK-NEXT: ret i1 true +; + %gep = getelementptr nuw i8, ptr %p, i64 %x + %cmp = icmp ne ptr %gep, null + ret i1 %cmp +} + +define i1 @test_gep_inbounds(ptr nonnull %p, i64 %x) { +; CHECK-LABEL: define i1 @test_gep_inbounds( +; CHECK-SAME: ptr nonnull [[P:%.*]], i64 [[X:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[X]] +; CHECK-NEXT: ret i1 true +; + %gep = getelementptr inbounds i8, ptr %p, i64 %x + %cmp = icmp ne ptr %gep, null + ret i1 %cmp +} + +define i1 @test_gep_inbounds_null_pointer_valid(ptr nonnull %p, i64 %x) null_pointer_is_valid { +; CHECK-LABEL: define i1 @test_gep_inbounds_null_pointer_valid( +; CHECK-SAME: ptr nonnull [[P:%.*]], i64 [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[X]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ne ptr [[GEP]], null +; CHECK-NEXT: ret i1 [[CMP]] +; + %gep = getelementptr inbounds i8, ptr %p, i64 %x + %cmp = icmp ne ptr %gep, null + ret i1 %cmp +} + +define i1 @test_select(i1 %c, ptr nonnull %p, i64 %x) { +; CHECK-LABEL: define i1 @test_select( +; CHECK-SAME: i1 [[C:%.*]], ptr nonnull [[P:%.*]], i64 [[X:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr nuw i8, ptr [[P]], i64 [[X]] +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C]], ptr [[P]], ptr [[GEP]] +; CHECK-NEXT: ret i1 true +; + %gep = getelementptr nuw i8, ptr %p, i64 %x + %sel = select i1 %c, ptr %p, ptr %gep + %cmp = icmp ne ptr %sel, null + ret i1 %cmp +} + +define i1 @test_select_not_nuw(i1 %c, ptr nonnull %p, i64 %x) { +; CHECK-LABEL: define i1 @test_select_not_nuw( +; CHECK-SAME: i1 [[C:%.*]], ptr nonnull [[P:%.*]], i64 [[X:%.*]]) { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[P]], i64 [[X]] +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C]], ptr [[P]], ptr [[GEP]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ne ptr [[SEL]], null +; CHECK-NEXT: ret i1 [[CMP]] +; + %gep = getelementptr i8, ptr %p, i64 %x + %sel = select i1 %c, ptr %p, ptr %gep + %cmp = icmp ne ptr %sel, null + ret i1 %cmp +} + +define i1 @test_phi(i1 %c, ptr nonnull %p, i64 %x) { +; CHECK-LABEL: define i1 @test_phi( +; CHECK-SAME: i1 [[C:%.*]], ptr nonnull [[P:%.*]], i64 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[JOIN:.*]] +; CHECK: [[IF]]: +; CHECK-NEXT: [[GEP:%.*]] = getelementptr nuw i8, ptr [[P]], i64 [[X]] +; CHECK-NEXT: br label %[[JOIN]] +; CHECK: [[JOIN]]: +; CHECK-NEXT: [[PHI:%.*]] = phi ptr [ [[P]], %[[ENTRY]] ], [ [[GEP]], %[[IF]] ] +; CHECK-NEXT: ret i1 true +; +entry: + br i1 %c, label %if, label %join + +if: + %gep = getelementptr nuw i8, ptr %p, i64 %x + br label %join + +join: + %phi = phi ptr [ %p, %entry ], [ %gep, %if ] + %cmp = icmp ne ptr %phi, null + ret i1 %cmp +} +;. +; CHECK: [[META0]] = !{} +;. 
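; Illustrative sketch (not part of this patch): the SCCP non-null sources tested
; above compose. Assuming the same `opt -S -passes=sccp` invocation as the RUN line
; of pointer-nonnull.ll, a dereferenceable argument combined with an inbounds GEP is
; expected to fold the compare to `true` (absent null_pointer_is_valid). The function
; name below is hypothetical and chosen only for this example.
;
; define i1 @nonnull_deref_then_inbounds(ptr dereferenceable(4) %p, i64 %x) {
;   %gep = getelementptr inbounds i8, ptr %p, i64 %x   ; inbounds offset of a nonnull base stays nonnull
;   %cmp = icmp ne ptr %gep, null                      ; expected to constant-fold to true by SCCP
;   ret i1 %cmp
; }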
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/long-mask-split.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/long-mask-split.ll new file mode 100644 index 0000000000000..cb07090c601f6 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/long-mask-split.ll @@ -0,0 +1,438 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=riscv64-unknown-linux-gnu -mattr=+v < %s | FileCheck %s + +define i32 @test() { +; CHECK-LABEL: define i32 @test( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr (i8, ptr null, i64 16), align 8 +; CHECK-NEXT: [[SHR_1_I:%.*]] = lshr i64 0, 0 +; CHECK-NEXT: [[SHR_1_I_13:%.*]] = lshr i64 0, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[SHR_1_I_13]] to i8 +; CHECK-NEXT: [[STOREDV_1_I_13:%.*]] = and i8 0, [[TMP1]] +; CHECK-NEXT: [[SHR_1_I_14:%.*]] = lshr i64 0, [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[SHR_1_I_14]] to i8 +; CHECK-NEXT: [[STOREDV_1_I_14:%.*]] = and i8 [[STOREDV_1_I_13]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr null, i64 32), align 8 +; CHECK-NEXT: [[SHR_2_I:%.*]] = lshr i64 0, [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[SHR_2_I]] to i8 +; CHECK-NEXT: [[STOREDV_2_I:%.*]] = and i8 [[STOREDV_1_I_14]], [[TMP4]] +; CHECK-NEXT: [[SHR_2_I_1:%.*]] = lshr i64 0, [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[SHR_2_I_1]] to i8 +; CHECK-NEXT: [[STOREDV_2_I_1:%.*]] = and i8 [[STOREDV_2_I]], [[TMP5]] +; CHECK-NEXT: [[SHR_2_I_2:%.*]] = lshr i64 0, [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[SHR_2_I_2]] to i8 +; CHECK-NEXT: [[STOREDV_2_I_2:%.*]] = and i8 [[STOREDV_2_I_1]], [[TMP6]] +; CHECK-NEXT: [[SHR_2_I_3:%.*]] = lshr i64 0, [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[SHR_2_I_3]] to i8 +; CHECK-NEXT: [[STOREDV_2_I_3:%.*]] = and i8 [[STOREDV_2_I_2]], [[TMP7]] +; CHECK-NEXT: [[SHR_2_I_4:%.*]] = lshr i64 0, [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[SHR_2_I_4]] to i8 +; CHECK-NEXT: [[STOREDV_2_I_4:%.*]] = and i8 [[STOREDV_2_I_3]], [[TMP8]] +; CHECK-NEXT: [[SHR_2_I_5:%.*]] = lshr i64 0, [[TMP3]] +; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[SHR_2_I_5]] to i8 +; CHECK-NEXT: [[STOREDV_2_I_5:%.*]] = and i8 [[STOREDV_2_I_4]], [[TMP9]] +; CHECK-NEXT: [[SHR_2_I_6:%.*]] = lshr i64 0, [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[SHR_2_I_6]] to i8 +; CHECK-NEXT: [[STOREDV_2_I_6:%.*]] = and i8 [[STOREDV_2_I_5]], [[TMP10]] +; CHECK-NEXT: [[SHR_2_I_7:%.*]] = lshr i64 0, [[TMP3]] +; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[SHR_2_I_7]] to i8 +; CHECK-NEXT: [[STOREDV_2_I_7:%.*]] = and i8 [[STOREDV_2_I_6]], [[TMP11]] +; CHECK-NEXT: [[STOREDV_2_I_8:%.*]] = and i8 [[STOREDV_2_I_7]], 0 +; CHECK-NEXT: [[SHR_2_I_9:%.*]] = lshr i64 0, [[TMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[SHR_2_I_9]] to i8 +; CHECK-NEXT: [[STOREDV_2_I_9:%.*]] = and i8 [[STOREDV_2_I_8]], [[TMP12]] +; CHECK-NEXT: [[SHR_2_I_10:%.*]] = lshr i64 0, [[TMP3]] +; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[SHR_2_I_10]] to i8 +; CHECK-NEXT: [[STOREDV_2_I_10:%.*]] = and i8 [[STOREDV_2_I_9]], [[TMP13]] +; CHECK-NEXT: [[SHR_2_I_11:%.*]] = lshr i64 0, [[TMP3]] +; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[SHR_2_I_11]] to i8 +; CHECK-NEXT: [[STOREDV_2_I_11:%.*]] = and i8 [[STOREDV_2_I_10]], [[TMP14]] +; CHECK-NEXT: [[SHR_2_I_12:%.*]] = lshr i64 0, [[TMP3]] +; CHECK-NEXT: [[TMP15:%.*]] = trunc i64 [[SHR_2_I_12]] to i8 +; CHECK-NEXT: [[STOREDV_2_I_12:%.*]] = and i8 [[STOREDV_2_I_11]], [[TMP15]] +; 
CHECK-NEXT: [[SHR_2_I_13:%.*]] = lshr i64 0, [[TMP3]] +; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[SHR_2_I_13]] to i8 +; CHECK-NEXT: [[STOREDV_2_I_13:%.*]] = and i8 [[STOREDV_2_I_12]], [[TMP16]] +; CHECK-NEXT: [[STOREDV_2_I_14:%.*]] = and i8 [[STOREDV_2_I_13]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr getelementptr (i8, ptr null, i64 48), align 8 +; CHECK-NEXT: [[SHR_3_I:%.*]] = lshr i64 0, [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[SHR_3_I]] to i8 +; CHECK-NEXT: [[STOREDV_3_I:%.*]] = and i8 [[STOREDV_2_I_14]], [[TMP18]] +; CHECK-NEXT: [[SHR_3_I_1:%.*]] = lshr i64 0, [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[SHR_3_I_1]] to i8 +; CHECK-NEXT: [[STOREDV_3_I_1:%.*]] = and i8 [[STOREDV_3_I]], [[TMP19]] +; CHECK-NEXT: [[STOREDV_3_I_2:%.*]] = and i8 [[STOREDV_3_I_1]], 0 +; CHECK-NEXT: [[SHR_3_I_3:%.*]] = lshr i64 0, [[TMP17]] +; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[SHR_3_I_3]] to i8 +; CHECK-NEXT: [[STOREDV_3_I_3:%.*]] = and i8 [[STOREDV_3_I_2]], [[TMP20]] +; CHECK-NEXT: [[SHR_3_I_4:%.*]] = lshr i64 0, [[TMP17]] +; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[SHR_3_I_4]] to i8 +; CHECK-NEXT: [[STOREDV_3_I_4:%.*]] = and i8 [[STOREDV_3_I_3]], [[TMP21]] +; CHECK-NEXT: [[SHR_3_I_5:%.*]] = lshr i64 0, [[TMP17]] +; CHECK-NEXT: [[TMP22:%.*]] = trunc i64 [[SHR_3_I_5]] to i8 +; CHECK-NEXT: [[STOREDV_3_I_5:%.*]] = and i8 [[STOREDV_3_I_4]], [[TMP22]] +; CHECK-NEXT: [[STOREDV_3_I_6:%.*]] = and i8 [[STOREDV_3_I_5]], 0 +; CHECK-NEXT: [[SHR_3_I_7:%.*]] = lshr i64 0, [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = trunc i64 [[SHR_3_I_7]] to i8 +; CHECK-NEXT: [[STOREDV_3_I_7:%.*]] = and i8 [[STOREDV_3_I_6]], [[TMP23]] +; CHECK-NEXT: [[SHR_3_I_8:%.*]] = lshr i64 0, [[TMP17]] +; CHECK-NEXT: [[TMP24:%.*]] = trunc i64 [[SHR_3_I_8]] to i8 +; CHECK-NEXT: [[STOREDV_3_I_8:%.*]] = and i8 [[STOREDV_3_I_7]], [[TMP24]] +; CHECK-NEXT: [[SHR_3_I_9:%.*]] = lshr i64 0, [[TMP17]] +; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[SHR_3_I_9]] to i8 +; CHECK-NEXT: [[STOREDV_3_I_9:%.*]] = and i8 [[STOREDV_3_I_8]], [[TMP25]] +; CHECK-NEXT: [[SHR_3_I_10:%.*]] = lshr i64 0, [[TMP17]] +; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[SHR_3_I_10]] to i8 +; CHECK-NEXT: [[STOREDV_3_I_10:%.*]] = and i8 [[STOREDV_3_I_9]], [[TMP26]] +; CHECK-NEXT: [[SHR_3_I_11:%.*]] = lshr i64 0, [[TMP17]] +; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[SHR_3_I_11]] to i8 +; CHECK-NEXT: [[STOREDV_3_I_11:%.*]] = and i8 [[STOREDV_3_I_10]], [[TMP27]] +; CHECK-NEXT: [[STOREDV_3_I_12:%.*]] = and i8 [[STOREDV_3_I_11]], 0 +; CHECK-NEXT: [[SHR_3_I_13:%.*]] = lshr i64 0, [[TMP17]] +; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[SHR_3_I_13]] to i8 +; CHECK-NEXT: [[STOREDV_3_I_13:%.*]] = and i8 [[STOREDV_3_I_12]], [[TMP28]] +; CHECK-NEXT: [[SHR_3_I_14:%.*]] = lshr i64 0, [[TMP17]] +; CHECK-NEXT: [[TMP29:%.*]] = trunc i64 [[SHR_3_I_14]] to i8 +; CHECK-NEXT: [[STOREDV_3_I_14:%.*]] = and i8 [[STOREDV_3_I_13]], [[TMP29]] +; CHECK-NEXT: [[TMP30:%.*]] = load i64, ptr null, align 8 +; CHECK-NEXT: [[STOREDV_4_I:%.*]] = and i8 [[STOREDV_3_I_14]], 0 +; CHECK-NEXT: [[SHR_4_I_1:%.*]] = lshr i64 0, [[TMP30]] +; CHECK-NEXT: [[TMP31:%.*]] = trunc i64 [[SHR_4_I_1]] to i8 +; CHECK-NEXT: [[STOREDV_4_I_1:%.*]] = and i8 [[STOREDV_4_I]], [[TMP31]] +; CHECK-NEXT: [[SHR_4_I_2:%.*]] = lshr i64 0, [[TMP30]] +; CHECK-NEXT: [[TMP32:%.*]] = trunc i64 [[SHR_4_I_2]] to i8 +; CHECK-NEXT: [[STOREDV_4_I_2:%.*]] = and i8 [[STOREDV_4_I_1]], [[TMP32]] +; CHECK-NEXT: [[SHR_4_I_3:%.*]] = lshr i64 0, [[TMP30]] +; CHECK-NEXT: [[TMP33:%.*]] = trunc i64 [[SHR_4_I_3]] to i8 +; CHECK-NEXT: [[STOREDV_4_I_3:%.*]] = and i8 
[[STOREDV_4_I_2]], [[TMP33]] +; CHECK-NEXT: [[SHR_4_I_4:%.*]] = lshr i64 0, [[TMP30]] +; CHECK-NEXT: [[TMP34:%.*]] = trunc i64 [[SHR_4_I_4]] to i8 +; CHECK-NEXT: [[STOREDV_4_I_4:%.*]] = and i8 [[STOREDV_4_I_3]], [[TMP34]] +; CHECK-NEXT: [[STOREDV_4_I_5:%.*]] = and i8 [[STOREDV_4_I_4]], 0 +; CHECK-NEXT: [[SHR_4_I_6:%.*]] = lshr i64 0, [[TMP30]] +; CHECK-NEXT: [[TMP35:%.*]] = trunc i64 [[SHR_4_I_6]] to i8 +; CHECK-NEXT: [[STOREDV_4_I_6:%.*]] = and i8 [[STOREDV_4_I_5]], [[TMP35]] +; CHECK-NEXT: [[SHR_4_I_7:%.*]] = lshr i64 0, [[TMP30]] +; CHECK-NEXT: [[TMP36:%.*]] = trunc i64 [[SHR_4_I_7]] to i8 +; CHECK-NEXT: [[STOREDV_4_I_7:%.*]] = and i8 [[STOREDV_4_I_6]], [[TMP36]] +; CHECK-NEXT: [[SHR_4_I_8:%.*]] = lshr i64 0, [[TMP30]] +; CHECK-NEXT: [[TMP37:%.*]] = trunc i64 [[SHR_4_I_8]] to i8 +; CHECK-NEXT: [[STOREDV_4_I_8:%.*]] = and i8 [[STOREDV_4_I_7]], [[TMP37]] +; CHECK-NEXT: [[SHR_4_I_9:%.*]] = lshr i64 0, [[TMP30]] +; CHECK-NEXT: [[TMP38:%.*]] = trunc i64 [[SHR_4_I_9]] to i8 +; CHECK-NEXT: [[STOREDV_4_I_9:%.*]] = and i8 [[STOREDV_4_I_8]], [[TMP38]] +; CHECK-NEXT: [[SHR_4_I_10:%.*]] = lshr i64 0, [[TMP30]] +; CHECK-NEXT: [[TMP39:%.*]] = trunc i64 [[SHR_4_I_10]] to i8 +; CHECK-NEXT: [[STOREDV_4_I_10:%.*]] = and i8 [[STOREDV_4_I_9]], [[TMP39]] +; CHECK-NEXT: [[STOREDV_4_I_11:%.*]] = and i8 [[STOREDV_4_I_10]], 0 +; CHECK-NEXT: [[SHR_4_I_12:%.*]] = lshr i64 0, [[TMP30]] +; CHECK-NEXT: [[TMP40:%.*]] = trunc i64 [[SHR_4_I_12]] to i8 +; CHECK-NEXT: [[STOREDV_4_I_12:%.*]] = and i8 [[STOREDV_4_I_11]], [[TMP40]] +; CHECK-NEXT: [[SHR_4_I_13:%.*]] = lshr i64 0, [[TMP30]] +; CHECK-NEXT: [[TMP41:%.*]] = trunc i64 [[SHR_4_I_13]] to i8 +; CHECK-NEXT: [[STOREDV_4_I_13:%.*]] = and i8 [[STOREDV_4_I_12]], [[TMP41]] +; CHECK-NEXT: [[SHR_4_I_14:%.*]] = lshr i64 0, [[TMP30]] +; CHECK-NEXT: [[TMP42:%.*]] = trunc i64 [[SHR_4_I_14]] to i8 +; CHECK-NEXT: [[STOREDV_4_I_14:%.*]] = and i8 [[STOREDV_4_I_13]], [[TMP42]] +; CHECK-NEXT: [[TMP43:%.*]] = load i64, ptr getelementptr (i8, ptr null, i64 80), align 8 +; CHECK-NEXT: [[SHR_5_I:%.*]] = lshr i64 0, [[TMP43]] +; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[SHR_5_I]] to i8 +; CHECK-NEXT: [[STOREDV_5_I:%.*]] = and i8 [[STOREDV_4_I_14]], [[TMP44]] +; CHECK-NEXT: [[SHR_5_I_1:%.*]] = lshr i64 0, [[TMP43]] +; CHECK-NEXT: [[TMP45:%.*]] = trunc i64 [[SHR_5_I_1]] to i8 +; CHECK-NEXT: [[STOREDV_5_I_1:%.*]] = and i8 [[STOREDV_5_I]], [[TMP45]] +; CHECK-NEXT: [[SHR_5_I_2:%.*]] = lshr i64 0, [[TMP43]] +; CHECK-NEXT: [[TMP46:%.*]] = trunc i64 [[SHR_5_I_2]] to i8 +; CHECK-NEXT: [[STOREDV_5_I_2:%.*]] = and i8 [[STOREDV_5_I_1]], [[TMP46]] +; CHECK-NEXT: [[STOREDV_5_I_3:%.*]] = and i8 [[STOREDV_5_I_2]], 0 +; CHECK-NEXT: [[SHR_5_I_4:%.*]] = lshr i64 0, [[TMP43]] +; CHECK-NEXT: [[TMP47:%.*]] = trunc i64 [[SHR_5_I_4]] to i8 +; CHECK-NEXT: [[STOREDV_5_I_4:%.*]] = and i8 [[STOREDV_5_I_3]], [[TMP47]] +; CHECK-NEXT: [[SHR_5_I_5:%.*]] = lshr i64 0, [[TMP43]] +; CHECK-NEXT: [[TMP48:%.*]] = trunc i64 [[SHR_5_I_5]] to i8 +; CHECK-NEXT: [[STOREDV_5_I_5:%.*]] = and i8 [[STOREDV_5_I_4]], [[TMP48]] +; CHECK-NEXT: [[SHR_5_I_6:%.*]] = lshr i64 0, [[TMP43]] +; CHECK-NEXT: [[TMP49:%.*]] = trunc i64 [[SHR_5_I_6]] to i8 +; CHECK-NEXT: [[STOREDV_5_I_6:%.*]] = and i8 [[STOREDV_5_I_5]], [[TMP49]] +; CHECK-NEXT: [[SHR_5_I_7:%.*]] = lshr i64 0, [[TMP43]] +; CHECK-NEXT: [[TMP50:%.*]] = trunc i64 [[SHR_5_I_7]] to i8 +; CHECK-NEXT: [[STOREDV_5_I_7:%.*]] = and i8 [[STOREDV_5_I_6]], [[TMP50]] +; CHECK-NEXT: [[SHR_5_I_8:%.*]] = lshr i64 0, [[TMP43]] +; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[SHR_5_I_8]] to i8 +; CHECK-NEXT: 
[[STOREDV_5_I_8:%.*]] = and i8 [[STOREDV_5_I_7]], [[TMP51]] +; CHECK-NEXT: [[STOREDV_5_I_9:%.*]] = and i8 [[STOREDV_5_I_8]], 0 +; CHECK-NEXT: [[SHR_5_I_10:%.*]] = lshr i64 0, [[TMP43]] +; CHECK-NEXT: [[TMP52:%.*]] = trunc i64 [[SHR_5_I_10]] to i8 +; CHECK-NEXT: [[STOREDV_5_I_10:%.*]] = and i8 [[STOREDV_5_I_9]], [[TMP52]] +; CHECK-NEXT: [[SHR_5_I_11:%.*]] = lshr i64 0, [[TMP43]] +; CHECK-NEXT: [[TMP53:%.*]] = trunc i64 [[SHR_5_I_11]] to i8 +; CHECK-NEXT: [[STOREDV_5_I_11:%.*]] = and i8 [[STOREDV_5_I_10]], [[TMP53]] +; CHECK-NEXT: [[SHR_5_I_12:%.*]] = lshr i64 0, [[TMP43]] +; CHECK-NEXT: [[TMP54:%.*]] = trunc i64 [[SHR_5_I_12]] to i8 +; CHECK-NEXT: [[STOREDV_5_I_12:%.*]] = and i8 [[STOREDV_5_I_11]], [[TMP54]] +; CHECK-NEXT: [[SHR_5_I_13:%.*]] = lshr i64 0, [[TMP43]] +; CHECK-NEXT: [[TMP55:%.*]] = trunc i64 [[SHR_5_I_13]] to i8 +; CHECK-NEXT: [[STOREDV_5_I_13:%.*]] = and i8 [[STOREDV_5_I_12]], [[TMP55]] +; CHECK-NEXT: [[SHR_5_I_14:%.*]] = lshr i64 0, [[TMP43]] +; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[SHR_5_I_14]] to i8 +; CHECK-NEXT: [[STOREDV_5_I_14:%.*]] = and i8 [[STOREDV_5_I_13]], [[TMP56]] +; CHECK-NEXT: [[TMP57:%.*]] = load i64, ptr null, align 8 +; CHECK-NEXT: [[SHR_6_I:%.*]] = lshr i64 0, [[TMP57]] +; CHECK-NEXT: [[TMP58:%.*]] = trunc i64 [[SHR_6_I]] to i8 +; CHECK-NEXT: [[STOREDV_6_I:%.*]] = and i8 [[STOREDV_5_I_14]], [[TMP58]] +; CHECK-NEXT: [[SHR_6_I_1:%.*]] = lshr i64 0, [[TMP57]] +; CHECK-NEXT: [[TMP59:%.*]] = trunc i64 [[SHR_6_I_1]] to i8 +; CHECK-NEXT: [[STOREDV_6_I_1:%.*]] = and i8 [[STOREDV_6_I]], [[TMP59]] +; CHECK-NEXT: [[SHR_6_I_2:%.*]] = lshr i64 0, [[TMP57]] +; CHECK-NEXT: [[TMP60:%.*]] = trunc i64 [[SHR_6_I_2]] to i8 +; CHECK-NEXT: [[STOREDV_6_I_2:%.*]] = and i8 [[STOREDV_6_I_1]], [[TMP60]] +; CHECK-NEXT: [[STOREDV_6_I_3:%.*]] = and i8 [[STOREDV_6_I_2]], 0 +; CHECK-NEXT: [[SHR_6_I_4:%.*]] = lshr i64 0, [[TMP57]] +; CHECK-NEXT: [[TMP61:%.*]] = trunc i64 [[SHR_6_I_4]] to i8 +; CHECK-NEXT: [[STOREDV_6_I_4:%.*]] = and i8 [[STOREDV_6_I_3]], [[TMP61]] +; CHECK-NEXT: [[SHR_6_I_5:%.*]] = lshr i64 0, [[TMP57]] +; CHECK-NEXT: [[TMP62:%.*]] = trunc i64 [[SHR_6_I_5]] to i8 +; CHECK-NEXT: [[STOREDV_6_I_5:%.*]] = and i8 [[STOREDV_6_I_4]], [[TMP62]] +; CHECK-NEXT: [[SHR_6_I_6:%.*]] = lshr i64 0, [[TMP57]] +; CHECK-NEXT: [[TMP63:%.*]] = trunc i64 [[SHR_6_I_6]] to i8 +; CHECK-NEXT: [[STOREDV_6_I_6:%.*]] = and i8 [[STOREDV_6_I_5]], [[TMP63]] +; CHECK-NEXT: [[SHR_6_I_7:%.*]] = lshr i64 0, [[TMP57]] +; CHECK-NEXT: [[TMP64:%.*]] = trunc i64 [[SHR_6_I_7]] to i8 +; CHECK-NEXT: [[STOREDV_6_I_7:%.*]] = and i8 [[STOREDV_6_I_6]], [[TMP64]] +; CHECK-NEXT: [[STOREDV_6_I_8:%.*]] = and i8 [[STOREDV_6_I_7]], 0 +; CHECK-NEXT: [[SHR_6_I_9:%.*]] = lshr i64 0, [[TMP57]] +; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[SHR_6_I_9]] to i8 +; CHECK-NEXT: [[STOREDV_6_I_9:%.*]] = and i8 [[STOREDV_6_I_8]], [[TMP65]] +; CHECK-NEXT: [[SHR_6_I_10:%.*]] = lshr i64 0, [[TMP57]] +; CHECK-NEXT: [[TMP66:%.*]] = trunc i64 [[SHR_6_I_10]] to i8 +; CHECK-NEXT: [[STOREDV_6_I_10:%.*]] = and i8 [[STOREDV_6_I_9]], [[TMP66]] +; CHECK-NEXT: [[SHR_6_I_11:%.*]] = lshr i64 0, [[TMP57]] +; CHECK-NEXT: [[TMP67:%.*]] = trunc i64 [[SHR_6_I_11]] to i8 +; CHECK-NEXT: [[STOREDV_6_I_11:%.*]] = and i8 [[STOREDV_6_I_10]], [[TMP67]] +; CHECK-NEXT: [[SHR_6_I_12:%.*]] = lshr i64 0, [[TMP57]] +; CHECK-NEXT: [[TMP68:%.*]] = trunc i64 [[SHR_6_I_12]] to i8 +; CHECK-NEXT: [[STOREDV_6_I_12:%.*]] = and i8 [[STOREDV_6_I_11]], [[TMP68]] +; CHECK-NEXT: [[STOREDV_6_I_13:%.*]] = and i8 [[STOREDV_6_I_12]], 0 +; CHECK-NEXT: [[SHR_6_I_14:%.*]] = lshr i64 0, 0 +; 
CHECK-NEXT: [[TMP69:%.*]] = trunc i64 [[SHR_6_I_14]] to i8 +; CHECK-NEXT: [[STOREDV_6_I_14:%.*]] = and i8 [[STOREDV_6_I_13]], [[TMP69]] +; CHECK-NEXT: store i8 [[STOREDV_6_I_14]], ptr null, align 1 +; CHECK-NEXT: ret i32 0 +; +entry: + %0 = load i64, ptr getelementptr (i8, ptr null, i64 16), align 8 + %shr.1.i = lshr i64 0, 0 + %shr.1.i.13 = lshr i64 0, %0 + %1 = trunc i64 %shr.1.i.13 to i8 + %storedv.1.i.13 = and i8 0, %1 + %shr.1.i.14 = lshr i64 0, %0 + %2 = trunc i64 %shr.1.i.14 to i8 + %storedv.1.i.14 = and i8 %storedv.1.i.13, %2 + %3 = load i64, ptr getelementptr (i8, ptr null, i64 32), align 8 + %shr.2.i = lshr i64 0, %3 + %4 = trunc i64 %shr.2.i to i8 + %storedv.2.i = and i8 %storedv.1.i.14, %4 + %shr.2.i.1 = lshr i64 0, %3 + %5 = trunc i64 %shr.2.i.1 to i8 + %storedv.2.i.1 = and i8 %storedv.2.i, %5 + %shr.2.i.2 = lshr i64 0, %3 + %6 = trunc i64 %shr.2.i.2 to i8 + %storedv.2.i.2 = and i8 %storedv.2.i.1, %6 + %shr.2.i.3 = lshr i64 0, %3 + %7 = trunc i64 %shr.2.i.3 to i8 + %storedv.2.i.3 = and i8 %storedv.2.i.2, %7 + %shr.2.i.4 = lshr i64 0, %3 + %8 = trunc i64 %shr.2.i.4 to i8 + %storedv.2.i.4 = and i8 %storedv.2.i.3, %8 + %shr.2.i.5 = lshr i64 0, %3 + %9 = trunc i64 %shr.2.i.5 to i8 + %storedv.2.i.5 = and i8 %storedv.2.i.4, %9 + %shr.2.i.6 = lshr i64 0, %3 + %10 = trunc i64 %shr.2.i.6 to i8 + %storedv.2.i.6 = and i8 %storedv.2.i.5, %10 + %shr.2.i.7 = lshr i64 0, %3 + %11 = trunc i64 %shr.2.i.7 to i8 + %storedv.2.i.7 = and i8 %storedv.2.i.6, %11 + %storedv.2.i.8 = and i8 %storedv.2.i.7, 0 + %shr.2.i.9 = lshr i64 0, %3 + %12 = trunc i64 %shr.2.i.9 to i8 + %storedv.2.i.9 = and i8 %storedv.2.i.8, %12 + %shr.2.i.10 = lshr i64 0, %3 + %13 = trunc i64 %shr.2.i.10 to i8 + %storedv.2.i.10 = and i8 %storedv.2.i.9, %13 + %shr.2.i.11 = lshr i64 0, %3 + %14 = trunc i64 %shr.2.i.11 to i8 + %storedv.2.i.11 = and i8 %storedv.2.i.10, %14 + %shr.2.i.12 = lshr i64 0, %3 + %15 = trunc i64 %shr.2.i.12 to i8 + %storedv.2.i.12 = and i8 %storedv.2.i.11, %15 + %shr.2.i.13 = lshr i64 0, %3 + %16 = trunc i64 %shr.2.i.13 to i8 + %storedv.2.i.13 = and i8 %storedv.2.i.12, %16 + %storedv.2.i.14 = and i8 %storedv.2.i.13, 0 + %17 = load i64, ptr getelementptr (i8, ptr null, i64 48), align 8 + %shr.3.i = lshr i64 0, %17 + %18 = trunc i64 %shr.3.i to i8 + %storedv.3.i = and i8 %storedv.2.i.14, %18 + %shr.3.i.1 = lshr i64 0, %17 + %19 = trunc i64 %shr.3.i.1 to i8 + %storedv.3.i.1 = and i8 %storedv.3.i, %19 + %storedv.3.i.2 = and i8 %storedv.3.i.1, 0 + %shr.3.i.3 = lshr i64 0, %17 + %20 = trunc i64 %shr.3.i.3 to i8 + %storedv.3.i.3 = and i8 %storedv.3.i.2, %20 + %shr.3.i.4 = lshr i64 0, %17 + %21 = trunc i64 %shr.3.i.4 to i8 + %storedv.3.i.4 = and i8 %storedv.3.i.3, %21 + %shr.3.i.5 = lshr i64 0, %17 + %22 = trunc i64 %shr.3.i.5 to i8 + %storedv.3.i.5 = and i8 %storedv.3.i.4, %22 + %storedv.3.i.6 = and i8 %storedv.3.i.5, 0 + %shr.3.i.7 = lshr i64 0, %17 + %23 = trunc i64 %shr.3.i.7 to i8 + %storedv.3.i.7 = and i8 %storedv.3.i.6, %23 + %shr.3.i.8 = lshr i64 0, %17 + %24 = trunc i64 %shr.3.i.8 to i8 + %storedv.3.i.8 = and i8 %storedv.3.i.7, %24 + %shr.3.i.9 = lshr i64 0, %17 + %25 = trunc i64 %shr.3.i.9 to i8 + %storedv.3.i.9 = and i8 %storedv.3.i.8, %25 + %shr.3.i.10 = lshr i64 0, %17 + %26 = trunc i64 %shr.3.i.10 to i8 + %storedv.3.i.10 = and i8 %storedv.3.i.9, %26 + %shr.3.i.11 = lshr i64 0, %17 + %27 = trunc i64 %shr.3.i.11 to i8 + %storedv.3.i.11 = and i8 %storedv.3.i.10, %27 + %storedv.3.i.12 = and i8 %storedv.3.i.11, 0 + %shr.3.i.13 = lshr i64 0, %17 + %28 = trunc i64 %shr.3.i.13 to i8 + %storedv.3.i.13 = and i8 
%storedv.3.i.12, %28 + %shr.3.i.14 = lshr i64 0, %17 + %29 = trunc i64 %shr.3.i.14 to i8 + %storedv.3.i.14 = and i8 %storedv.3.i.13, %29 + %30 = load i64, ptr null, align 8 + %storedv.4.i = and i8 %storedv.3.i.14, 0 + %shr.4.i.1 = lshr i64 0, %30 + %31 = trunc i64 %shr.4.i.1 to i8 + %storedv.4.i.1 = and i8 %storedv.4.i, %31 + %shr.4.i.2 = lshr i64 0, %30 + %32 = trunc i64 %shr.4.i.2 to i8 + %storedv.4.i.2 = and i8 %storedv.4.i.1, %32 + %shr.4.i.3 = lshr i64 0, %30 + %33 = trunc i64 %shr.4.i.3 to i8 + %storedv.4.i.3 = and i8 %storedv.4.i.2, %33 + %shr.4.i.4 = lshr i64 0, %30 + %34 = trunc i64 %shr.4.i.4 to i8 + %storedv.4.i.4 = and i8 %storedv.4.i.3, %34 + %storedv.4.i.5 = and i8 %storedv.4.i.4, 0 + %shr.4.i.6 = lshr i64 0, %30 + %35 = trunc i64 %shr.4.i.6 to i8 + %storedv.4.i.6 = and i8 %storedv.4.i.5, %35 + %shr.4.i.7 = lshr i64 0, %30 + %36 = trunc i64 %shr.4.i.7 to i8 + %storedv.4.i.7 = and i8 %storedv.4.i.6, %36 + %shr.4.i.8 = lshr i64 0, %30 + %37 = trunc i64 %shr.4.i.8 to i8 + %storedv.4.i.8 = and i8 %storedv.4.i.7, %37 + %shr.4.i.9 = lshr i64 0, %30 + %38 = trunc i64 %shr.4.i.9 to i8 + %storedv.4.i.9 = and i8 %storedv.4.i.8, %38 + %shr.4.i.10 = lshr i64 0, %30 + %39 = trunc i64 %shr.4.i.10 to i8 + %storedv.4.i.10 = and i8 %storedv.4.i.9, %39 + %storedv.4.i.11 = and i8 %storedv.4.i.10, 0 + %shr.4.i.12 = lshr i64 0, %30 + %40 = trunc i64 %shr.4.i.12 to i8 + %storedv.4.i.12 = and i8 %storedv.4.i.11, %40 + %shr.4.i.13 = lshr i64 0, %30 + %41 = trunc i64 %shr.4.i.13 to i8 + %storedv.4.i.13 = and i8 %storedv.4.i.12, %41 + %shr.4.i.14 = lshr i64 0, %30 + %42 = trunc i64 %shr.4.i.14 to i8 + %storedv.4.i.14 = and i8 %storedv.4.i.13, %42 + %43 = load i64, ptr getelementptr (i8, ptr null, i64 80), align 8 + %shr.5.i = lshr i64 0, %43 + %44 = trunc i64 %shr.5.i to i8 + %storedv.5.i = and i8 %storedv.4.i.14, %44 + %shr.5.i.1 = lshr i64 0, %43 + %45 = trunc i64 %shr.5.i.1 to i8 + %storedv.5.i.1 = and i8 %storedv.5.i, %45 + %shr.5.i.2 = lshr i64 0, %43 + %46 = trunc i64 %shr.5.i.2 to i8 + %storedv.5.i.2 = and i8 %storedv.5.i.1, %46 + %storedv.5.i.3 = and i8 %storedv.5.i.2, 0 + %shr.5.i.4 = lshr i64 0, %43 + %47 = trunc i64 %shr.5.i.4 to i8 + %storedv.5.i.4 = and i8 %storedv.5.i.3, %47 + %shr.5.i.5 = lshr i64 0, %43 + %48 = trunc i64 %shr.5.i.5 to i8 + %storedv.5.i.5 = and i8 %storedv.5.i.4, %48 + %shr.5.i.6 = lshr i64 0, %43 + %49 = trunc i64 %shr.5.i.6 to i8 + %storedv.5.i.6 = and i8 %storedv.5.i.5, %49 + %shr.5.i.7 = lshr i64 0, %43 + %50 = trunc i64 %shr.5.i.7 to i8 + %storedv.5.i.7 = and i8 %storedv.5.i.6, %50 + %shr.5.i.8 = lshr i64 0, %43 + %51 = trunc i64 %shr.5.i.8 to i8 + %storedv.5.i.8 = and i8 %storedv.5.i.7, %51 + %storedv.5.i.9 = and i8 %storedv.5.i.8, 0 + %shr.5.i.10 = lshr i64 0, %43 + %52 = trunc i64 %shr.5.i.10 to i8 + %storedv.5.i.10 = and i8 %storedv.5.i.9, %52 + %shr.5.i.11 = lshr i64 0, %43 + %53 = trunc i64 %shr.5.i.11 to i8 + %storedv.5.i.11 = and i8 %storedv.5.i.10, %53 + %shr.5.i.12 = lshr i64 0, %43 + %54 = trunc i64 %shr.5.i.12 to i8 + %storedv.5.i.12 = and i8 %storedv.5.i.11, %54 + %shr.5.i.13 = lshr i64 0, %43 + %55 = trunc i64 %shr.5.i.13 to i8 + %storedv.5.i.13 = and i8 %storedv.5.i.12, %55 + %shr.5.i.14 = lshr i64 0, %43 + %56 = trunc i64 %shr.5.i.14 to i8 + %storedv.5.i.14 = and i8 %storedv.5.i.13, %56 + %57 = load i64, ptr null, align 8 + %shr.6.i = lshr i64 0, %57 + %58 = trunc i64 %shr.6.i to i8 + %storedv.6.i = and i8 %storedv.5.i.14, %58 + %shr.6.i.1 = lshr i64 0, %57 + %59 = trunc i64 %shr.6.i.1 to i8 + %storedv.6.i.1 = and i8 %storedv.6.i, %59 + %shr.6.i.2 = 
lshr i64 0, %57 + %60 = trunc i64 %shr.6.i.2 to i8 + %storedv.6.i.2 = and i8 %storedv.6.i.1, %60 + %storedv.6.i.3 = and i8 %storedv.6.i.2, 0 + %shr.6.i.4 = lshr i64 0, %57 + %61 = trunc i64 %shr.6.i.4 to i8 + %storedv.6.i.4 = and i8 %storedv.6.i.3, %61 + %shr.6.i.5 = lshr i64 0, %57 + %62 = trunc i64 %shr.6.i.5 to i8 + %storedv.6.i.5 = and i8 %storedv.6.i.4, %62 + %shr.6.i.6 = lshr i64 0, %57 + %63 = trunc i64 %shr.6.i.6 to i8 + %storedv.6.i.6 = and i8 %storedv.6.i.5, %63 + %shr.6.i.7 = lshr i64 0, %57 + %64 = trunc i64 %shr.6.i.7 to i8 + %storedv.6.i.7 = and i8 %storedv.6.i.6, %64 + %storedv.6.i.8 = and i8 %storedv.6.i.7, 0 + %shr.6.i.9 = lshr i64 0, %57 + %65 = trunc i64 %shr.6.i.9 to i8 + %storedv.6.i.9 = and i8 %storedv.6.i.8, %65 + %shr.6.i.10 = lshr i64 0, %57 + %66 = trunc i64 %shr.6.i.10 to i8 + %storedv.6.i.10 = and i8 %storedv.6.i.9, %66 + %shr.6.i.11 = lshr i64 0, %57 + %67 = trunc i64 %shr.6.i.11 to i8 + %storedv.6.i.11 = and i8 %storedv.6.i.10, %67 + %shr.6.i.12 = lshr i64 0, %57 + %68 = trunc i64 %shr.6.i.12 to i8 + %storedv.6.i.12 = and i8 %storedv.6.i.11, %68 + %storedv.6.i.13 = and i8 %storedv.6.i.12, 0 + %shr.6.i.14 = lshr i64 0, 0 + %69 = trunc i64 %shr.6.i.14 to i8 + %storedv.6.i.14 = and i8 %storedv.6.i.13, %69 + store i8 %storedv.6.i.14, ptr null, align 1 + ret i32 0 +} diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll new file mode 100644 index 0000000000000..d822a24220df2 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll @@ -0,0 +1,43 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck %s + +define i32 @test() { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[IF_END_I87:%.*]] +; CHECK: if.end.i87: +; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> , <4 x i64> ), i32 4, <4 x i1> , <4 x i32> poison) +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> poison, i64 0) +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP1]], <2 x i32> zeroinitializer, i64 2) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: switch i32 0, label [[SW_BB509_I:%.*]] [ +; CHECK-NEXT: i32 1, label [[SW_BB509_I]] +; CHECK-NEXT: i32 0, label [[IF_THEN458_I:%.*]] +; CHECK-NEXT: ] +; CHECK: if.then458.i: +; CHECK-NEXT: br label [[SW_BB509_I]] +; CHECK: sw.bb509.i: +; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i32> [ [[TMP0]], [[IF_THEN458_I]] ], [ [[TMP3]], [[IF_END_I87]] ], [ [[TMP3]], [[IF_END_I87]] ] +; CHECK-NEXT: ret i32 0 +; +entry: + %getelementptr0 = getelementptr i8, ptr null, i64 64036 + %getelementptr1 = getelementptr i8, ptr null, i64 64064 + br label %if.end.i87 + +if.end.i87: ; preds = %entry + %0 = load <2 x i32>, ptr %getelementptr0, align 4 + %1 = load <2 x i32>, ptr %getelementptr1, align 8 + switch i32 0, label %sw.bb509.i [ + i32 1, label %sw.bb509.i + i32 0, label %if.then458.i + ] + +if.then458.i: ; preds = %if.end.i87 + br label %sw.bb509.i + +sw.bb509.i: ; preds = %if.then458.i, %if.end.i87, %if.end.i87 + %4 = phi <2 x i32> [ %0, %if.then458.i ], [ %0, %if.end.i87 ], [ %0, %if.end.i87 ] + %5 = phi <2 x i32> [ %1, %if.then458.i ], [ zeroinitializer, %if.end.i87 ], [ zeroinitializer, %if.end.i87 ] + 
ret i32 0 +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll new file mode 100644 index 0000000000000..7e75970de3492 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define i1 @test(i32 %g, i16 %d) { +; CHECK-LABEL: define i1 @test( +; CHECK-SAME: i32 [[G:%.*]], i16 [[D:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = and i16 [[D]], 1 +; CHECK-NEXT: [[XOR_I_I:%.*]] = xor i32 [[G]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[G]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[XOR_I_I]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = trunc <2 x i32> [[TMP9]] to <2 x i8> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i8> [[TMP4]], <2 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i8> [[TMP5]], +; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i8> [[TMP6]], +; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i1> [[TMP7]] to <4 x i8> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i8> [[TMP8]] to <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = icmp sgt <4 x i32> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP12]]) +; CHECK-NEXT: ret i1 [[TMP13]] +; +entry: + %0 = and i16 %d, 1 + %xor.i.i = xor i32 %g, 1 + %conv1.i.i = trunc i32 %xor.i.i to i8 + %notsub.i = add i8 %conv1.i.i, -1 + %cmp.i.i = icmp sgt i8 %notsub.i, -3 + %conv3.i.i = zext i1 %cmp.i.i to i32 + %cmp4.i.i = icmp sgt i32 %xor.i.i, %conv3.i.i + %conv1.1.i.i = trunc i32 %g to i8 + %notsub25.i = add i8 %conv1.1.i.i, -1 + %cmp.1.i.i = icmp sgt i8 %notsub25.i, -3 + %conv3.1.i.i = zext i1 %cmp.1.i.i to i32 + %cmp4.1.i.i = icmp sgt i32 %g, %conv3.1.i.i + %notsub26.i = add i8 %conv1.1.i.i, -9 + %cmp.i17.i = icmp sgt i8 %notsub26.i, -3 + %conv3.i18.i = zext i1 %cmp.i17.i to i32 + %cmp4.i19.i = icmp sgt i32 %g, %conv3.i18.i + %notsub27.i = add i8 %conv1.i.i, -9 + %cmp.1.i22.i = icmp sgt i8 %notsub27.i, -3 + %conv3.1.i23.i = zext i1 %cmp.1.i22.i to i32 + %cmp4.1.i24.i = icmp sgt i32 %xor.i.i, %conv3.1.i23.i + %1 = and i1 %cmp4.i19.i, %cmp4.1.i24.i + %2 = and i1 %cmp4.i.i, %1 + %3 = and i1 %cmp4.1.i.i, %2 + ret i1 %3 +} diff --git a/llvm/test/Verifier/rtsan-attrs.ll b/llvm/test/Verifier/rtsan-attrs.ll new file mode 100644 index 0000000000000..42ab85163642b --- /dev/null +++ b/llvm/test/Verifier/rtsan-attrs.ll @@ -0,0 +1,9 @@ +; RUN: not llvm-as -disable-output %s 2>&1 | FileCheck %s + +; CHECK: Attributes 'sanitize_realtime and nosanitize_realtime' are incompatible! 
+; CHECK-NEXT: ptr @sanitize_nosanitize +define void @sanitize_nosanitize() #0 { + ret void +} + +attributes #0 = { sanitize_realtime nosanitize_realtime } diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp index 5094871a1d415..b47c77c5f2ff3 100644 --- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp +++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp @@ -1183,11 +1183,9 @@ void ProfileGeneratorBase::extractProbesFromRange( do { const AddressProbesMap &Address2ProbesMap = Binary->getAddress2ProbesMap(); - auto It = Address2ProbesMap.find(IP.Address); - if (It != Address2ProbesMap.end()) { - for (const MCDecodedPseudoProbe &Probe : It->second) { - ProbeCounter[&Probe] += Count; - } + for (const MCDecodedPseudoProbe &Probe : + Address2ProbesMap.find(IP.Address)) { + ProbeCounter[&Probe] += Count; } } while (IP.advance() && IP.Address <= RangeEnd); } @@ -1293,9 +1291,9 @@ void CSProfileGenerator::populateBodySamplesWithProbes( // and will be inferred by the compiler. for (auto &I : FrameSamples) { for (auto *FunctionProfile : I.second) { - for (auto *Probe : I.first->getProbes()) { - FunctionProfile->addBodySamples(Probe->getIndex(), - Probe->getDiscriminator(), 0); + for (const MCDecodedPseudoProbe &Probe : I.first->getProbes()) { + FunctionProfile->addBodySamples(Probe.getIndex(), + Probe.getDiscriminator(), 0); } } } diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp index a458ffcb96b41..e4fc3816cd0c4 100644 --- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp +++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp @@ -132,7 +132,7 @@ void BinarySizeContextTracker::trackInlineesOptimizedAway( MCPseudoProbeDecoder &ProbeDecoder) { ProbeFrameStack ProbeContext; for (const auto &Child : ProbeDecoder.getDummyInlineRoot().getChildren()) - trackInlineesOptimizedAway(ProbeDecoder, *Child.second, ProbeContext); + trackInlineesOptimizedAway(ProbeDecoder, Child, ProbeContext); } void BinarySizeContextTracker::trackInlineesOptimizedAway( @@ -160,9 +160,9 @@ void BinarySizeContextTracker::trackInlineesOptimizedAway( // DFS down the probe inline tree for (const auto &ChildNode : ProbeNode.getChildren()) { - InlineSite Location = ChildNode.first; + InlineSite Location = ChildNode.getInlineSite(); ProbeContext.back().second = std::get<1>(Location); - trackInlineesOptimizedAway(ProbeDecoder, *ChildNode.second, ProbeContext); + trackInlineesOptimizedAway(ProbeDecoder, ChildNode, ProbeContext); } ProbeContext.pop_back(); @@ -454,8 +454,8 @@ void ProfiledBinary::decodePseudoProbe(const ELFObjectFileBase *Obj) { // Build TopLevelProbeFrameMap to track size for optimized inlinees when probe // is available if (TrackFuncContextSize) { - for (const auto &Child : ProbeDecoder.getDummyInlineRoot().getChildren()) { - auto *Frame = Child.second.get(); + for (auto &Child : ProbeDecoder.getDummyInlineRoot().getChildren()) { + auto *Frame = &Child; StringRef FuncName = ProbeDecoder.getFuncDescForGUID(Frame->Guid)->FuncName; TopLevelProbeFrameMap[FuncName] = Frame; diff --git a/llvm/unittests/ADT/FunctionRefTest.cpp b/llvm/unittests/ADT/FunctionRefTest.cpp index 47633cada0f3c..b181933973037 100644 --- a/llvm/unittests/ADT/FunctionRefTest.cpp +++ b/llvm/unittests/ADT/FunctionRefTest.cpp @@ -59,4 +59,14 @@ TEST(FunctionRefTest, SFINAE) { EXPECT_EQ("string", returns([] { return "hello"; })); } +TEST(FunctionRefTest, Equality) { + function_ref X = [] { return 1; }; + function_ref Y = X; + EXPECT_EQ(X, Y); + + const auto Lambda = []() { 
return 0; }; + function_ref A(Lambda), B(Lambda); + EXPECT_EQ(A, B); +} + } // namespace diff --git a/llvm/unittests/CodeGen/GlobalISel/GISelMITest.h b/llvm/unittests/CodeGen/GlobalISel/GISelMITest.h index fd31e95cce13d..544dcabb56d0a 100644 --- a/llvm/unittests/CodeGen/GlobalISel/GISelMITest.h +++ b/llvm/unittests/CodeGen/GlobalISel/GISelMITest.h @@ -53,11 +53,9 @@ std::ostream & operator<<(std::ostream &OS, const MachineFunction &MF); } -static std::unique_ptr parseMIR(LLVMContext &Context, - std::unique_ptr &MIR, - const TargetMachine &TM, - StringRef MIRCode, const char *FuncName, - MachineModuleInfo &MMI) { +static std::unique_ptr +parseMIR(LLVMContext &Context, std::unique_ptr &MIR, + const TargetMachine &TM, StringRef MIRCode, MachineModuleInfo &MMI) { SMDiagnostic Diagnostic; std::unique_ptr MBuffer = MemoryBuffer::getMemBuffer(MIRCode); MIR = createMIRParser(std::move(MBuffer), Context); @@ -80,8 +78,7 @@ createDummyModule(LLVMContext &Context, const LLVMTargetMachine &TM, StringRef MIRString, const char *FuncName) { std::unique_ptr MIR; auto MMI = std::make_unique(&TM); - std::unique_ptr M = - parseMIR(Context, MIR, TM, MIRString, FuncName, *MMI); + std::unique_ptr M = parseMIR(Context, MIR, TM, MIRString, *MMI); return make_pair(std::move(M), std::move(MMI)); } diff --git a/llvm/unittests/CodeGen/MachineDomTreeUpdaterTest.cpp b/llvm/unittests/CodeGen/MachineDomTreeUpdaterTest.cpp index 9dcf3754a5bd7..f8505817d2e09 100644 --- a/llvm/unittests/CodeGen/MachineDomTreeUpdaterTest.cpp +++ b/llvm/unittests/CodeGen/MachineDomTreeUpdaterTest.cpp @@ -73,7 +73,7 @@ class MachineDomTreeUpdaterTest : public testing::Test { MAM.registerPass([&] { return MachineModuleAnalysis(*MMI); }); } - bool parseMIR(StringRef MIRCode, const char *FnName) { + bool parseMIR(StringRef MIRCode) { SMDiagnostic Diagnostic; std::unique_ptr MBuffer = MemoryBuffer::getMemBuffer(MIRCode); MIR = createMIRParser(std::move(MBuffer), Context); @@ -149,7 +149,7 @@ body: | ... )"; - ASSERT_TRUE(parseMIR(MIRString, "f0")); + ASSERT_TRUE(parseMIR(MIRString)); auto &MF = FAM.getResult(*M->getFunction("f0")).getMF(); @@ -239,7 +239,7 @@ body: | ... 
)"; - ASSERT_TRUE(parseMIR(MIRString, "f0")); + ASSERT_TRUE(parseMIR(MIRString)); auto &MF = FAM.getResult(*M->getFunction("f0")).getMF(); diff --git a/llvm/unittests/MI/LiveIntervalTest.cpp b/llvm/unittests/MI/LiveIntervalTest.cpp index bba5cb84d1152..7dcd82f3e7aa6 100644 --- a/llvm/unittests/MI/LiveIntervalTest.cpp +++ b/llvm/unittests/MI/LiveIntervalTest.cpp @@ -55,8 +55,10 @@ std::unique_ptr createTargetMachine() { } std::unique_ptr parseMIR(LLVMContext &Context, - legacy::PassManagerBase &PM, std::unique_ptr &MIR, - const LLVMTargetMachine &TM, StringRef MIRCode, const char *FuncName) { + legacy::PassManagerBase &PM, + std::unique_ptr &MIR, + const LLVMTargetMachine &TM, + StringRef MIRCode) { SMDiagnostic Diagnostic; std::unique_ptr MBuffer = MemoryBuffer::getMemBuffer(MIRCode); MIR = createMIRParser(std::move(MBuffer), Context); @@ -209,7 +211,7 @@ static void doTest(StringRef MIRFunc, legacy::PassManager PM; std::unique_ptr MIR; - std::unique_ptr M = parseMIR(Context, PM, MIR, *TM, MIRFunc, "func"); + std::unique_ptr M = parseMIR(Context, PM, MIR, *TM, MIRFunc); ASSERT_TRUE(M); PM.add(new TestPassT(T, ShouldPass)); diff --git a/llvm/unittests/MIR/MachineMetadata.cpp b/llvm/unittests/MIR/MachineMetadata.cpp index 9b1c3ef1c465a..cd30768f7ce76 100644 --- a/llvm/unittests/MIR/MachineMetadata.cpp +++ b/llvm/unittests/MIR/MachineMetadata.cpp @@ -79,7 +79,7 @@ class MachineMetadataTest : public testing::Test { } std::unique_ptr parseMIR(const TargetMachine &TM, StringRef MIRCode, - const char *FnName, MachineModuleInfo &MMI) { + MachineModuleInfo &MMI) { SMDiagnostic Diagnostic; std::unique_ptr MBuffer = MemoryBuffer::getMemBuffer(MIRCode); MIR = createMIRParser(std::move(MBuffer), Context); @@ -227,7 +227,7 @@ body: | )MIR"; MachineModuleInfo MMI(TM.get()); - M = parseMIR(*TM, MIRString, "test0", MMI); + M = parseMIR(*TM, MIRString, MMI); ASSERT_TRUE(M); auto *MF = MMI.getMachineFunction(*M->getFunction("test0")); @@ -338,7 +338,7 @@ body: | )MIR"; MachineModuleInfo MMI(TM.get()); - M = parseMIR(*TM, MIRString, "test0", MMI); + M = parseMIR(*TM, MIRString, MMI); ASSERT_TRUE(M); auto *MF = MMI.getMachineFunction(*M->getFunction("test0")); @@ -376,7 +376,7 @@ body: | )MIR"; MachineModuleInfo MMI(TM.get()); - M = parseMIR(*TM, MIRString, "test0", MMI); + M = parseMIR(*TM, MIRString, MMI); ASSERT_TRUE(M); auto *MF = MMI.getMachineFunction(*M->getFunction("test0")); @@ -474,7 +474,7 @@ body: | )MIR"; MachineModuleInfo MMI(TM.get()); - M = parseMIR(*TM, MIRString, "test0", MMI); + M = parseMIR(*TM, MIRString, MMI); ASSERT_TRUE(M); auto *MF = MMI.getMachineFunction(*M->getFunction("test0")); @@ -563,7 +563,7 @@ body: | ... )MIR"; MachineModuleInfo MMI(TM.get()); - M = parseMIR(*TM, MIRString, "foo", MMI); + M = parseMIR(*TM, MIRString, MMI); ASSERT_TRUE(M); auto *MF = MMI.getMachineFunction(*M->getFunction("foo")); MachineFunctionProperties &Properties = MF->getProperties(); @@ -594,7 +594,7 @@ body: | ... 
)MIR"; MachineModuleInfo MMI(TM.get()); - M = parseMIR(*TM, MIRString, "foo", MMI); + M = parseMIR(*TM, MIRString, MMI); ASSERT_TRUE(M); auto *MF = MMI.getMachineFunction(*M->getFunction("foo")); MachineFunctionProperties &Properties = MF->getProperties(); diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index 83edd954080e9..869e752b23913 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -580,6 +580,81 @@ define void @foo(i8 %v1) { EXPECT_EQ(I0->getNextNode(), Ret); } +TEST_F(SandboxIRTest, FreezeInst) { + parseIR(C, R"IR( +define void @foo(i8 %arg) { + freeze i8 %arg + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + + sandboxir::Context Ctx(C); + sandboxir::Function *F = Ctx.createFunction(LLVMF); + auto *Arg = F->getArg(0); + auto *BB = &*F->begin(); + auto It = BB->begin(); + auto *Freeze = cast(&*It++); + auto *Ret = cast(&*It++); + + EXPECT_TRUE(isa(Freeze)); + EXPECT_EQ(Freeze->getOperand(0), Arg); + + // Check create(). + auto *NewFreeze = sandboxir::FreezeInst::create( + Arg, Ret->getIterator(), Ret->getParent(), Ctx, "NewFreeze"); + EXPECT_EQ(NewFreeze->getNextNode(), Ret); +#ifndef NDEBUG + EXPECT_EQ(NewFreeze->getName(), "NewFreeze"); +#endif // NDEBUG +} + +TEST_F(SandboxIRTest, FenceInst) { + parseIR(C, R"IR( +define void @foo() { + fence syncscope("singlethread") seq_cst + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + llvm::BasicBlock *LLVMBB = &*LLVMF->begin(); + auto *LLVMFence = cast(&*LLVMBB->begin()); + sandboxir::Context Ctx(C); + sandboxir::Function *F = Ctx.createFunction(LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + auto *Fence = cast(&*It++); + auto *Ret = cast(&*It++); + + // Check getOrdering(). + EXPECT_EQ(Fence->getOrdering(), LLVMFence->getOrdering()); + // Check setOrdering(). + auto OrigOrdering = Fence->getOrdering(); + auto NewOrdering = AtomicOrdering::Release; + EXPECT_NE(NewOrdering, OrigOrdering); + Fence->setOrdering(NewOrdering); + EXPECT_EQ(Fence->getOrdering(), NewOrdering); + Fence->setOrdering(OrigOrdering); + EXPECT_EQ(Fence->getOrdering(), OrigOrdering); + // Check getSyncScopeID(). + EXPECT_EQ(Fence->getSyncScopeID(), LLVMFence->getSyncScopeID()); + // Check setSyncScopeID(). + auto OrigSSID = Fence->getSyncScopeID(); + auto NewSSID = SyncScope::System; + EXPECT_NE(NewSSID, OrigSSID); + Fence->setSyncScopeID(NewSSID); + EXPECT_EQ(Fence->getSyncScopeID(), NewSSID); + Fence->setSyncScopeID(OrigSSID); + EXPECT_EQ(Fence->getSyncScopeID(), OrigSSID); + // Check create(). 
+ auto *NewFence = + sandboxir::FenceInst::create(AtomicOrdering::Release, Ret->getIterator(), + BB, Ctx, SyncScope::SingleThread); + EXPECT_EQ(NewFence->getNextNode(), Ret); + EXPECT_EQ(NewFence->getOrdering(), AtomicOrdering::Release); + EXPECT_EQ(NewFence->getSyncScopeID(), SyncScope::SingleThread); +} + TEST_F(SandboxIRTest, SelectInst) { parseIR(C, R"IR( define void @foo(i1 %c0, i8 %v0, i8 %v1, i1 %c1) { @@ -1867,6 +1942,67 @@ define void @foo(i8 %arg) { } } +TEST_F(SandboxIRTest, LandingPadInst) { + parseIR(C, R"IR( +define void @foo() { +entry: + invoke void @foo() + to label %bb unwind label %unwind +unwind: + %lpad = landingpad { ptr, i32 } + catch ptr null + ret void +bb: + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + auto *LLVMUnwind = getBasicBlockByName(LLVMF, "unwind"); + auto *LLVMLPad = cast(&*LLVMUnwind->begin()); + + sandboxir::Context Ctx(C); + [[maybe_unused]] auto &F = *Ctx.createFunction(&LLVMF); + auto *Unwind = cast(Ctx.getValue(LLVMUnwind)); + auto *BB = cast( + Ctx.getValue(getBasicBlockByName(LLVMF, "bb"))); + auto It = Unwind->begin(); + auto *LPad = cast(&*It++); + [[maybe_unused]] auto *Ret = cast(&*It++); + + // Check isCleanup(). + EXPECT_EQ(LPad->isCleanup(), LLVMLPad->isCleanup()); + // Check setCleanup(). + auto OrigIsCleanup = LPad->isCleanup(); + auto NewIsCleanup = true; + EXPECT_NE(NewIsCleanup, OrigIsCleanup); + LPad->setCleanup(NewIsCleanup); + EXPECT_EQ(LPad->isCleanup(), NewIsCleanup); + LPad->setCleanup(OrigIsCleanup); + EXPECT_EQ(LPad->isCleanup(), OrigIsCleanup); + // Check getNumClauses(). + EXPECT_EQ(LPad->getNumClauses(), LLVMLPad->getNumClauses()); + // Check getClause(). + for (auto Idx : seq(0, LPad->getNumClauses())) + EXPECT_EQ(LPad->getClause(Idx), Ctx.getValue(LLVMLPad->getClause(Idx))); + // Check isCatch(). + for (auto Idx : seq(0, LPad->getNumClauses())) + EXPECT_EQ(LPad->isCatch(Idx), LLVMLPad->isCatch(Idx)); + // Check isFilter(). + for (auto Idx : seq(0, LPad->getNumClauses())) + EXPECT_EQ(LPad->isFilter(Idx), LLVMLPad->isFilter(Idx)); + // Check create(). + auto *BBRet = &*BB->begin(); + auto *NewLPad = + cast(sandboxir::LandingPadInst::create( + Type::getInt8Ty(C), 0, BBRet->getIterator(), BBRet->getParent(), Ctx, + "NewLPad")); + EXPECT_EQ(NewLPad->getNextNode(), BBRet); + EXPECT_FALSE(NewLPad->isCleanup()); +#ifndef NDEBUG + EXPECT_EQ(NewLPad->getName(), "NewLPad"); +#endif // NDEBUG +} + TEST_F(SandboxIRTest, FuncletPadInst_CatchPadInst_CleanupPadInst) { parseIR(C, R"IR( define void @foo() { @@ -2825,6 +2961,31 @@ define void @foo(i8 %arg0, i8 %arg1, float %farg0, float %farg1) { } } +TEST_F(SandboxIRTest, PossiblyDisjointInst) { + parseIR(C, R"IR( +define void @foo(i8 %arg0, i8 %arg1) { + %or = or i8 %arg0, %arg1 + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + + auto &F = *Ctx.createFunction(&LLVMF); + auto *BB = &*F.begin(); + auto It = BB->begin(); + auto *PDI = cast(&*It++); + + // Check setIsDisjoint(), isDisjoint(). 
+ auto OrigIsDisjoint = PDI->isDisjoint(); + auto NewIsDisjoint = true; + EXPECT_NE(NewIsDisjoint, OrigIsDisjoint); + PDI->setIsDisjoint(NewIsDisjoint); + EXPECT_EQ(PDI->isDisjoint(), NewIsDisjoint); + PDI->setIsDisjoint(OrigIsDisjoint); + EXPECT_EQ(PDI->isDisjoint(), OrigIsDisjoint); +} + TEST_F(SandboxIRTest, AtomicRMWInst) { parseIR(C, R"IR( define void @foo(ptr %ptr, i8 %arg) { @@ -3521,6 +3682,48 @@ define void @foo(i32 %arg, float %farg, double %darg, ptr %ptr) { } } +TEST_F(SandboxIRTest, PossiblyNonNegInst) { + parseIR(C, R"IR( +define void @foo(i32 %arg, float %farg, double %darg, ptr %ptr) { + %zext = zext i32 %arg to i64 + %uitofp = uitofp i32 %arg to float + + %sext = sext i32 %arg to i64 + %fptoui = fptoui float %farg to i32 + %fptosi = fptosi float %farg to i32 + %fpext = fpext float %farg to double + %ptrtoint = ptrtoint ptr %ptr to i32 + %inttoptr = inttoptr i32 %arg to ptr + %sitofp = sitofp i32 %arg to float + %trunc = trunc i32 %arg to i16 + %fptrunc = fptrunc double %darg to float + %bitcast = bitcast i32 %arg to float + %addrspacecast = addrspacecast ptr %ptr to ptr addrspace(1) + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + sandboxir::Function *F = Ctx.createFunction(&LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + auto *PNNI0 = cast(&*It++); + auto *PNNI1 = cast(&*It++); + for (auto ItE = BB->end(); It != ItE; ++It) + EXPECT_FALSE(isa(&*It++)); + + for (auto *PNNI : {PNNI0, PNNI1}) { + // Check setNonNeg(), hasNonNeg(). + auto OrigNonNeg = PNNI->hasNonNeg(); + auto NewNonNeg = true; + EXPECT_NE(NewNonNeg, OrigNonNeg); + PNNI->setNonNeg(NewNonNeg); + EXPECT_EQ(PNNI->hasNonNeg(), NewNonNeg); + PNNI->setNonNeg(OrigNonNeg); + EXPECT_EQ(PNNI->hasNonNeg(), OrigNonNeg); + } +} + /// CastInst's subclasses are very similar so we can use a common test function /// for them. template diff --git a/llvm/unittests/SandboxIR/TrackerTest.cpp b/llvm/unittests/SandboxIR/TrackerTest.cpp index f0d6a0d57b8c3..ca6effb727bf3 100644 --- a/llvm/unittests/SandboxIR/TrackerTest.cpp +++ b/llvm/unittests/SandboxIR/TrackerTest.cpp @@ -542,6 +542,40 @@ define void @foo(ptr %ptr) { EXPECT_EQ(It, BB->end()); } +TEST_F(TrackerTest, FenceInstSetters) { + parseIR(C, R"IR( +define void @foo() { + fence syncscope("singlethread") seq_cst + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + sandboxir::Function *F = Ctx.createFunction(LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + auto *Fence = cast(&*It++); + + // Check setOrdering(). + auto OrigOrdering = Fence->getOrdering(); + auto NewOrdering = AtomicOrdering::Release; + EXPECT_NE(NewOrdering, OrigOrdering); + Ctx.save(); + Fence->setOrdering(NewOrdering); + EXPECT_EQ(Fence->getOrdering(), NewOrdering); + Ctx.revert(); + EXPECT_EQ(Fence->getOrdering(), OrigOrdering); + // Check setSyncScopeID(). 
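  // Same pattern as setOrdering() above: the change made after Ctx.save() is
  // expected to be rolled back by Ctx.revert().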
+ auto OrigSSID = Fence->getSyncScopeID(); + auto NewSSID = SyncScope::System; + EXPECT_NE(NewSSID, OrigSSID); + Ctx.save(); + Fence->setSyncScopeID(NewSSID); + EXPECT_EQ(Fence->getSyncScopeID(), NewSSID); + Ctx.revert(); + EXPECT_EQ(Fence->getSyncScopeID(), OrigSSID); +} + TEST_F(TrackerTest, CallBaseSetters) { parseIR(C, R"IR( declare void @bar1(i8) @@ -713,6 +747,41 @@ define void @foo(i32 %cond0, i32 %cond1) { EXPECT_EQ(*HIt++, Handler1); } +TEST_F(TrackerTest, LandingPadInstSetters) { + parseIR(C, R"IR( +define void @foo() { +entry: + invoke void @foo() + to label %bb unwind label %unwind +unwind: + %lpad = landingpad { ptr, i32 } + catch ptr null + ret void +bb: + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + auto *LLVMUnwind = getBasicBlockByName(LLVMF, "unwind"); + + sandboxir::Context Ctx(C); + [[maybe_unused]] auto &F = *Ctx.createFunction(&LLVMF); + auto *Unwind = cast(Ctx.getValue(LLVMUnwind)); + auto It = Unwind->begin(); + auto *LPad = cast(&*It++); + [[maybe_unused]] auto *Ret = cast(&*It++); + + // Check setCleanup(). + auto OrigIsCleanup = LPad->isCleanup(); + auto NewIsCleanup = true; + EXPECT_NE(NewIsCleanup, OrigIsCleanup); + Ctx.save(); + LPad->setCleanup(NewIsCleanup); + EXPECT_EQ(LPad->isCleanup(), NewIsCleanup); + Ctx.revert(); + EXPECT_EQ(LPad->isCleanup(), OrigIsCleanup); +} + TEST_F(TrackerTest, CatchReturnInstSetters) { parseIR(C, R"IR( define void @foo() { @@ -919,6 +988,58 @@ define void @foo(<2 x i8> %v1, <2 x i8> %v2) { EXPECT_THAT(SVI->getShuffleMask(), testing::ElementsAreArray(OrigMask)); } +TEST_F(TrackerTest, PossiblyDisjointInstSetters) { + parseIR(C, R"IR( +define void @foo(i8 %arg0, i8 %arg1) { + %or = or i8 %arg0, %arg1 + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + + auto &F = *Ctx.createFunction(&LLVMF); + auto *BB = &*F.begin(); + auto It = BB->begin(); + auto *PDI = cast(&*It++); + + // Check setIsDisjoint(). + auto OrigIsDisjoint = PDI->isDisjoint(); + auto NewIsDisjoint = true; + EXPECT_NE(NewIsDisjoint, OrigIsDisjoint); + Ctx.save(); + PDI->setIsDisjoint(NewIsDisjoint); + EXPECT_EQ(PDI->isDisjoint(), NewIsDisjoint); + Ctx.revert(); + EXPECT_EQ(PDI->isDisjoint(), OrigIsDisjoint); +} + +TEST_F(TrackerTest, PossiblyNonNegInstSetters) { + parseIR(C, R"IR( +define void @foo(i32 %arg) { + %zext = zext i32 %arg to i64 + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + + auto &F = *Ctx.createFunction(&LLVMF); + auto *BB = &*F.begin(); + auto It = BB->begin(); + auto *PNNI = cast(&*It++); + + // Check setNonNeg(). 
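  // `nneg` (here on a zext) asserts that the operand is non-negative; the
  // tracker is expected to restore the original flag value on Ctx.revert().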
+ auto OrigNonNeg = PNNI->hasNonNeg(); + auto NewNonNeg = true; + EXPECT_NE(NewNonNeg, OrigNonNeg); + Ctx.save(); + PNNI->setNonNeg(NewNonNeg); + EXPECT_EQ(PNNI->hasNonNeg(), NewNonNeg); + Ctx.revert(); + EXPECT_EQ(PNNI->hasNonNeg(), OrigNonNeg); +} + TEST_F(TrackerTest, AtomicRMWSetters) { parseIR(C, R"IR( define void @foo(ptr %ptr, i8 %arg) { diff --git a/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp b/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp index 5838ab6f782ba..55caaf5d13b6c 100644 --- a/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp +++ b/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp @@ -43,7 +43,7 @@ std::unique_ptr createTargetMachine() { std::unique_ptr parseMIR(LLVMContext &Context, std::unique_ptr &MIR, const TargetMachine &TM, StringRef MIRCode, - const char *FuncName, MachineModuleInfo &MMI) { + MachineModuleInfo &MMI) { SMDiagnostic Diagnostic; std::unique_ptr MBuffer = MemoryBuffer::getMemBuffer(MIRCode); MIR = createMIRParser(std::move(MBuffer), Context); @@ -157,8 +157,7 @@ body: | LLVMContext Context; std::unique_ptr MIR; MachineModuleInfo MMI(TM.get()); - std::unique_ptr M = - parseMIR(Context, MIR, *TM, MIRString, "test0", MMI); + std::unique_ptr M = parseMIR(Context, MIR, *TM, MIRString, MMI); ASSERT_TRUE(M); Function *F = M->getFunction("test0"); @@ -332,8 +331,7 @@ body: | LLVMContext Context; std::unique_ptr MIR; MachineModuleInfo MMI(TM.get()); - std::unique_ptr M = - parseMIR(Context, MIR, *TM, MIRString, "test1", MMI); + std::unique_ptr M = parseMIR(Context, MIR, *TM, MIRString, MMI); ASSERT_TRUE(M); Function *F = M->getFunction("test1"); diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp index 3b630e3cf014e..f351087ad212f 100644 --- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp +++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp @@ -1013,7 +1013,7 @@ void MatchableInfo::tokenizeAsmString(const AsmMatcherInfo &Info, InTok = false; IsIsolatedToken = false; } - addAsmOperand(String.slice(i, i + 1), IsIsolatedToken); + addAsmOperand(String.substr(i, 1), IsIsolatedToken); Prev = i + 1; IsIsolatedToken = true; continue; @@ -1037,7 +1037,7 @@ void MatchableInfo::tokenizeAsmString(const AsmMatcherInfo &Info, } ++i; assert(i != String.size() && "Invalid quoted character"); - addAsmOperand(String.slice(i, i + 1), IsIsolatedToken); + addAsmOperand(String.substr(i, 1), IsIsolatedToken); Prev = i + 1; IsIsolatedToken = false; break; diff --git a/llvm/utils/TableGen/CallingConvEmitter.cpp b/llvm/utils/TableGen/CallingConvEmitter.cpp index ec6ef56a66fa0..6a3030bfc1b7e 100644 --- a/llvm/utils/TableGen/CallingConvEmitter.cpp +++ b/llvm/utils/TableGen/CallingConvEmitter.cpp @@ -160,7 +160,7 @@ void CallingConvEmitter::EmitAction(Record *Action, unsigned Indent, ListInit *RegList = Action->getValueAsListInit("RegList"); if (RegList->size() == 1) { std::string Name = getQualifiedName(RegList->getElementAsRecord(0)); - O << IndentStr << "if (unsigned Reg = State.AllocateReg(" << Name + O << IndentStr << "if (MCRegister Reg = State.AllocateReg(" << Name << ")) {\n"; if (SwiftAction) AssignedSwiftRegsMap[CurrentAction].insert(Name); @@ -180,7 +180,7 @@ void CallingConvEmitter::EmitAction(Record *Action, unsigned Indent, O << LS << Name; } O << "\n" << IndentStr << "};\n"; - O << IndentStr << "if (unsigned Reg = State.AllocateReg(RegList" + O << IndentStr << "if (MCRegister Reg = State.AllocateReg(RegList" << Counter << ")) {\n"; } O << IndentStr << " 
State.addLoc(CCValAssign::getReg(ValNo, ValVT, " @@ -217,7 +217,7 @@ void CallingConvEmitter::EmitAction(Record *Action, unsigned Indent, "Invalid length of list of shadowed registers"); if (RegList->size() == 1) { - O << IndentStr << "if (unsigned Reg = State.AllocateReg("; + O << IndentStr << "if (MCRegister Reg = State.AllocateReg("; O << getQualifiedName(RegList->getElementAsRecord(0)); O << ", " << getQualifiedName(ShadowRegList->getElementAsRecord(0)); O << ")) {\n"; @@ -241,7 +241,7 @@ void CallingConvEmitter::EmitAction(Record *Action, unsigned Indent, O << LSS << getQualifiedName(ShadowRegList->getElementAsRecord(i)); O << "\n" << IndentStr << "};\n"; - O << IndentStr << "if (unsigned Reg = State.AllocateReg(RegList" + O << IndentStr << "if (MCRegister Reg = State.AllocateReg(RegList" << RegListNumber << ", " << "RegList" << ShadowRegListNumber << ")) {\n"; } diff --git a/llvm/utils/TableGen/DisassemblerEmitter.cpp b/llvm/utils/TableGen/DisassemblerEmitter.cpp index d41750075b41f..f2c25d38ad2a7 100644 --- a/llvm/utils/TableGen/DisassemblerEmitter.cpp +++ b/llvm/utils/TableGen/DisassemblerEmitter.cpp @@ -11,6 +11,7 @@ #include "WebAssemblyDisassemblerEmitter.h" #include "X86DisassemblerTables.h" #include "X86RecognizableInstr.h" +#include "llvm/Support/CommandLine.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" #include "llvm/TableGen/TableGenBackend.h" diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/IntrinsicEmitter.cpp index 8e536c99f627f..4c211cdca84c5 100644 --- a/llvm/utils/TableGen/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/IntrinsicEmitter.cpp @@ -65,6 +65,14 @@ class IntrinsicEmitter { void EmitIntrinsicToBuiltinMap(const CodeGenIntrinsicTable &Ints, bool IsClang, raw_ostream &OS); }; + +// Helper class to use with `TableGen::Emitter::OptClass`. 
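// `OptClass` presumably requires a type that is constructible from a
// `RecordKeeper` and exposes `run(raw_ostream &)`; this shim forwards to
// IntrinsicEmitter::run() with the bool template parameter choosing between
// the enum and implementation outputs registered below.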
+template class IntrinsicEmitterOpt : public IntrinsicEmitter { +public: + IntrinsicEmitterOpt(const RecordKeeper &R) : IntrinsicEmitter(R) {} + void run(raw_ostream &OS) { IntrinsicEmitter::run(OS, Enums); } +}; + } // End anonymous namespace //===----------------------------------------------------------------------===// @@ -770,16 +778,8 @@ Intrinsic::getIntrinsicFor{1}Builtin(StringRef TargetPrefix, UpperCompilerName); } -static void EmitIntrinsicEnums(RecordKeeper &RK, raw_ostream &OS) { - IntrinsicEmitter(RK).run(OS, /*Enums=*/true); -} - -static TableGen::Emitter::Opt X("gen-intrinsic-enums", EmitIntrinsicEnums, - "Generate intrinsic enums"); - -static void EmitIntrinsicImpl(RecordKeeper &RK, raw_ostream &OS) { - IntrinsicEmitter(RK).run(OS, /*Enums=*/false); -} +static TableGen::Emitter::OptClass> + X("gen-intrinsic-enums", "Generate intrinsic enums"); -static TableGen::Emitter::Opt Y("gen-intrinsic-impl", EmitIntrinsicImpl, - "Generate intrinsic implementation code"); +static TableGen::Emitter::OptClass> + Y("gen-intrinsic-impl", "Generate intrinsic implementation code"); diff --git a/llvm/utils/TableGen/TableGen.cpp b/llvm/utils/TableGen/TableGen.cpp index c420843574cbf..7ee6fa5c83211 100644 --- a/llvm/utils/TableGen/TableGen.cpp +++ b/llvm/utils/TableGen/TableGen.cpp @@ -39,7 +39,7 @@ static cl::opt Class("class", cl::value_desc("class name"), cl::cat(PrintEnumsCat)); -static void PrintRecords(RecordKeeper &Records, raw_ostream &OS) { +static void PrintRecords(const RecordKeeper &Records, raw_ostream &OS) { OS << Records; // No argument, dump all contents } @@ -49,14 +49,14 @@ static void PrintEnums(RecordKeeper &Records, raw_ostream &OS) { OS << "\n"; } -static void PrintSets(RecordKeeper &Records, raw_ostream &OS) { +static void PrintSets(const RecordKeeper &Records, raw_ostream &OS) { SetTheory Sets; Sets.addFieldExpander("Set", "Elements"); for (Record *Rec : Records.getAllDerivedDefinitions("Set")) { OS << Rec->getName() << " = ["; const std::vector *Elts = Sets.expand(Rec); assert(Elts && "Couldn't expand Set instance"); - for (Record *Elt : *Elts) + for (const Record *Elt : *Elts) OS << ' ' << Elt->getName(); OS << " ]\n"; } @@ -67,7 +67,7 @@ static TableGen::Emitter::Opt X[] = { true}, {"print-detailed-records", EmitDetailedRecords, "Print full details of all records to stdout"}, - {"null-backend", [](RecordKeeper &Records, raw_ostream &OS) {}, + {"null-backend", [](const RecordKeeper &Records, raw_ostream &OS) {}, "Do nothing after parsing (useful for timing)"}, {"dump-json", EmitJSON, "Dump all records as machine-readable JSON"}, {"print-enums", PrintEnums, "Print enum values for a class"}, diff --git a/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn index 1dd34f2a077bc..0428322e07e3e 100644 --- a/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn @@ -62,6 +62,32 @@ static_library("AST") { "AttrDocTable.cpp", "AttrImpl.cpp", "Availability.cpp", + "ByteCode/ByteCodeEmitter.cpp", + "ByteCode/Compiler.cpp", + "ByteCode/Context.cpp", + "ByteCode/Descriptor.cpp", + "ByteCode/Disasm.cpp", + "ByteCode/DynamicAllocator.cpp", + "ByteCode/EvalEmitter.cpp", + "ByteCode/EvaluationResult.cpp", + "ByteCode/Floating.cpp", + "ByteCode/Frame.cpp", + "ByteCode/Function.cpp", + "ByteCode/FunctionPointer.cpp", + "ByteCode/Interp.cpp", + "ByteCode/InterpBlock.cpp", + "ByteCode/InterpBuiltin.cpp", + "ByteCode/InterpFrame.cpp", + "ByteCode/InterpShared.cpp", + "ByteCode/InterpStack.cpp", + 
"ByteCode/InterpState.cpp", + "ByteCode/MemberPointer.cpp", + "ByteCode/Pointer.cpp", + "ByteCode/PrimType.cpp", + "ByteCode/Program.cpp", + "ByteCode/Record.cpp", + "ByteCode/Source.cpp", + "ByteCode/State.cpp", "CXXInheritance.cpp", "Comment.cpp", "CommentBriefParser.cpp", @@ -92,31 +118,6 @@ static_library("AST") { "ExternalASTSource.cpp", "FormatString.cpp", "InheritViz.cpp", - "ByteCode/ByteCodeEmitter.cpp", - "ByteCode/Compiler.cpp", - "ByteCode/Context.cpp", - "ByteCode/Descriptor.cpp", - "ByteCode/Disasm.cpp", - "ByteCode/DynamicAllocator.cpp", - "ByteCode/EvalEmitter.cpp", - "ByteCode/EvaluationResult.cpp", - "ByteCode/Floating.cpp", - "ByteCode/Frame.cpp", - "ByteCode/Function.cpp", - "ByteCode/Interp.cpp", - "ByteCode/InterpBlock.cpp", - "ByteCode/InterpBuiltin.cpp", - "ByteCode/InterpFrame.cpp", - "ByteCode/InterpShared.cpp", - "ByteCode/InterpStack.cpp", - "ByteCode/InterpState.cpp", - "ByteCode/MemberPointer.cpp", - "ByteCode/Pointer.cpp", - "ByteCode/PrimType.cpp", - "ByteCode/Program.cpp", - "ByteCode/Record.cpp", - "ByteCode/Source.cpp", - "ByteCode/State.cpp", "ItaniumCXXABI.cpp", "ItaniumMangle.cpp", "JSONNodeDumper.cpp", diff --git a/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn b/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn index 8cb60fd81840f..b627b1ecc2548 100644 --- a/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn @@ -45,6 +45,7 @@ executable("lldb-dap") { "FifoFiles.cpp", "FunctionBreakpoint.cpp", "IOStream.cpp", + "InstructionBreakpoint.cpp", "JSONUtils.cpp", "LLDBUtils.cpp", "OutputRedirector.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn index 006e1ed700b82..dd4af4e98832f 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -166,7 +166,6 @@ static_library("LLVMAMDGPUCodeGen") { "AMDGPULowerModuleLDSPass.cpp", "AMDGPUMCInstLower.cpp", "AMDGPUMIRFormatter.cpp", - "AMDGPUMachineCFGStructurizer.cpp", "AMDGPUMachineFunction.cpp", "AMDGPUMachineModuleInfo.cpp", "AMDGPUMacroFusion.cpp", @@ -189,6 +188,7 @@ static_library("LLVMAMDGPUCodeGen") { "AMDGPUSetWavePriority.cpp", "AMDGPUSplitModule.cpp", "AMDGPUSubtarget.cpp", + "AMDGPUSwLowerLDS.cpp", "AMDGPUTargetMachine.cpp", "AMDGPUTargetObjectFile.cpp", "AMDGPUTargetTransformInfo.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn index 83b55be3f9208..4ebeaff28c14a 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/PowerPC/BUILD.gn @@ -68,7 +68,6 @@ static_library("LLVMPowerPCCodeGen") { "PPCCallingConv.cpp", "PPCEarlyReturn.cpp", "PPCExpandAtomicPseudoInsts.cpp", - "PPCExpandISEL.cpp", "PPCFastISel.cpp", "PPCFrameLowering.cpp", "PPCGenScalarMASSEntries.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn index 58acabf85d296..55e25e0fe5b79 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn @@ -24,6 +24,7 @@ static_library("Utils") { "CodeExtractor.cpp", "CodeLayout.cpp", "CodeMoverUtils.cpp", + "ControlFlowUtils.cpp", "CountVisits.cpp", "CtorUtils.cpp", "DXILUpgrade.cpp", diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index 
4dad1412436d9..e3ca9b1fb32cc 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -356,7 +356,7 @@ def executeBuiltinPopd(cmd, shenv): def executeBuiltinExport(cmd, shenv): """executeBuiltinExport - Set an environment variable.""" if len(cmd.args) != 2: - raise InternalShellError("'export' supports only one argument") + raise InternalShellError(cmd, "'export' supports only one argument") updateEnv(shenv, cmd.args) return ShellCommandResult(cmd, "", "", 0, False) diff --git a/llvm/utils/lit/tests/Inputs/shtest-export/export-too-many-args.txt b/llvm/utils/lit/tests/Inputs/shtest-export/export-too-many-args.txt new file mode 100644 index 0000000000000..b282e1a176498 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-export/export-too-many-args.txt @@ -0,0 +1,2 @@ +## Test export command with too many arguments. +# RUN: export FOO=1 BAR=2 diff --git a/llvm/utils/lit/tests/Inputs/shtest-export/lit.cfg b/llvm/utils/lit/tests/Inputs/shtest-export/lit.cfg new file mode 100644 index 0000000000000..22ddf13ea3857 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-export/lit.cfg @@ -0,0 +1,7 @@ +import lit.formats + +config.name = "shtest-export" +config.suffixes = [".txt"] +config.test_format = lit.formats.ShTest() +config.test_source_root = None +config.test_exec_root = None diff --git a/llvm/utils/lit/tests/shtest-export.py b/llvm/utils/lit/tests/shtest-export.py new file mode 100644 index 0000000000000..f2de8e8cd8b5f --- /dev/null +++ b/llvm/utils/lit/tests/shtest-export.py @@ -0,0 +1,12 @@ +## Test the export command. + +# RUN: not %{lit} -a -v %{inputs}/shtest-export \ +# RUN: | FileCheck -match-full-lines %s +# +# END. + +# CHECK: FAIL: shtest-export :: export-too-many-args.txt {{.*}} +# CHECK: export FOO=1 BAR=2 +# CHECK: # executed command: export FOO=1 BAR=2 +# CHECK: # | 'export' supports only one argument +# CHECK: # error: command failed with exit status: {{.*}} diff --git a/mlir/docs/Dialects/emitc.md b/mlir/docs/Dialects/emitc.md index 4b0394606e4a2..743d70959f3d8 100644 --- a/mlir/docs/Dialects/emitc.md +++ b/mlir/docs/Dialects/emitc.md @@ -12,6 +12,10 @@ The following convention is followed: operation, C++20 is required. * If `ssize_t` is used, then the code requires the POSIX header `sys/types.h` or any of the C++ headers in which the type is defined. +* If `_Float16` is used, the code requires the support of C additional + floating types. +* If `__bf16` is used, the code requires a compiler that supports it, such as + GCC or Clang. * Else the generated code is compatible with C99. These restrictions are neither inherent to the EmitC dialect itself nor to the diff --git a/mlir/include/mlir-c/Dialect/LLVM.h b/mlir/include/mlir-c/Dialect/LLVM.h index 631b564618320..5eb96a86e472d 100644 --- a/mlir/include/mlir-c/Dialect/LLVM.h +++ b/mlir/include/mlir-c/Dialect/LLVM.h @@ -316,7 +316,8 @@ MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMDISubprogramAttrGet( MlirContext ctx, MlirAttribute id, MlirAttribute compileUnit, MlirAttribute scope, MlirAttribute name, MlirAttribute linkageName, MlirAttribute file, unsigned int line, unsigned int scopeLine, - uint64_t subprogramFlags, MlirAttribute type); + uint64_t subprogramFlags, MlirAttribute type, intptr_t nRetainedNodes, + MlirAttribute const *retainedNodes); /// Gets the scope from this DISubprogramAttr. 
MLIR_CAPI_EXPORTED MlirAttribute @@ -353,6 +354,12 @@ MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMDIModuleAttrGet( MlirAttribute name, MlirAttribute configMacros, MlirAttribute includePath, MlirAttribute apinotes, unsigned int line, bool isDecl); +/// Creates a LLVM DIImportedEntityAttr attribute. +MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMDIImportedEntityAttrGet( + MlirContext ctx, unsigned int tag, MlirAttribute entity, MlirAttribute file, + unsigned int line, MlirAttribute name, intptr_t nElements, + MlirAttribute const *elements); + /// Gets the scope of this DIModuleAttr. MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMDIModuleAttrGetScope(MlirAttribute diModule); diff --git a/mlir/include/mlir/Bindings/Python/IRTypes.h b/mlir/include/mlir/Bindings/Python/IRTypes.h new file mode 100644 index 0000000000000..9afad4c23b3f3 --- /dev/null +++ b/mlir/include/mlir/Bindings/Python/IRTypes.h @@ -0,0 +1,31 @@ +//===- IRTypes.h - Type Interfaces ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_BINDINGS_PYTHON_IRTYPES_H +#define MLIR_BINDINGS_PYTHON_IRTYPES_H + +#include "mlir/Bindings/Python/PybindAdaptors.h" + +namespace mlir { + +/// Shaped Type Interface - ShapedType +class PyShapedType : public python::PyConcreteType { +public: + static const IsAFunctionTy isaFunction; + static constexpr const char *pyClassName = "ShapedType"; + using PyConcreteType::PyConcreteType; + + static void bindDerived(ClassTy &c); + +private: + void requireHasRank(); +}; + +} // namespace mlir + +#endif // MLIR_BINDINGS_PYTHON_IRTYPES_H diff --git a/mlir/include/mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h b/mlir/include/mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h index 78c79c915e060..28fdc234e5ef0 100644 --- a/mlir/include/mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h +++ b/mlir/include/mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h @@ -9,7 +9,9 @@ #ifndef MLIR_CONVERSION_ARITHTOAMDGPU_ARITHTOAMDGPU_H #define MLIR_CONVERSION_ARITHTOAMDGPU_ARITHTOAMDGPU_H +#include "mlir/Dialect/AMDGPU/Utils/Chipset.h" #include +#include namespace mlir { @@ -26,7 +28,10 @@ namespace arith { /// to the largest value of that type instead of being rewritten to Inf (aka /// NaN). 
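/// A typical call, mirroring ArithToAMDGPUConversionPass::runOnOperation()
/// later in this patch (sketch): parse the chipset string, decide whether FP8
/// arithmetic is legal on it, then do
///   arith::populateArithToAMDGPUConversionPatterns(
///       patterns, convertFP8Arithmetic, saturateFP8Truncf, allowPackedF16Rtz,
///       *maybeChipset);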
void populateArithToAMDGPUConversionPatterns(RewritePatternSet &patterns, - bool saturateFP8TruncF); + bool convertFP8Arithmetic, + bool saturateFP8Truncf, + bool allowPackedF16Rtz, + amdgpu::Chipset chipset); } // namespace arith } // namespace mlir diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index 7bde9e490e4f4..383e7dca0429c 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -150,9 +150,15 @@ def ArithToAMDGPUConversionPass : Pass<"convert-arith-to-amdgpu"> { let dependentDialects = ["amdgpu::AMDGPUDialect", "vector::VectorDialect"]; let options = [ + Option<"chipset", "chipset", "std::string", + /*default=*/"\"gfx000\"", + "Chipset that these operations will run on">, Option<"saturateFP8Truncf", "saturate-fp8-truncf", "bool", /*default=*/"false", "Use saturating truncation for 8-bit float types">, + Option<"allowPackedF16Rtz", "allow-packed-f16-round-to-zero", "bool", + /*default=*/"false", + "Whether we should allow f32->f16 packed round-to-zero conversion">, ]; } diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 97e0580c89808..e5c1a53f34bf6 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -25,6 +25,7 @@ def AMDGPU_Dialect : Dialect { let dependentDialects = [ + "ROCDL::ROCDLDialect", "arith::ArithDialect", "gpu::GPUDialect" ]; diff --git a/mlir/include/mlir/Dialect/DLTI/DLTI.h b/mlir/include/mlir/Dialect/DLTI/DLTI.h index a97eb523cb063..f268fea340a6f 100644 --- a/mlir/include/mlir/Dialect/DLTI/DLTI.h +++ b/mlir/include/mlir/Dialect/DLTI/DLTI.h @@ -26,7 +26,7 @@ namespace mlir { namespace dlti { /// Perform a DLTI-query at `op`, recursively querying each key of `keys` on /// query interface-implementing attrs, starting from attr obtained from `op`. 
-FailureOr query(Operation *op, ArrayRef keys, +FailureOr query(Operation *op, ArrayRef keys, bool emitError = false); } // namespace dlti } // namespace mlir diff --git a/mlir/include/mlir/Dialect/DLTI/TransformOps/DLTITransformOps.td b/mlir/include/mlir/Dialect/DLTI/TransformOps/DLTITransformOps.td index 1b1bebfaab4e3..f25bb383912d4 100644 --- a/mlir/include/mlir/Dialect/DLTI/TransformOps/DLTITransformOps.td +++ b/mlir/include/mlir/Dialect/DLTI/TransformOps/DLTITransformOps.td @@ -26,9 +26,10 @@ def QueryOp : Op:$line, OptionalParameter<"unsigned">:$scopeLine, OptionalParameter<"DISubprogramFlags">:$subprogramFlags, - OptionalParameter<"DISubroutineTypeAttr">:$type + OptionalParameter<"DISubroutineTypeAttr">:$type, + OptionalArrayRefParameter<"DINodeAttr">:$retainedNodes ); let builders = [ AttrBuilderWithInferredContext<(ins "DistinctAttr":$id, "DICompileUnitAttr":$compileUnit, "DIScopeAttr":$scope, "StringRef":$name, "StringRef":$linkageName, "DIFileAttr":$file, "unsigned":$line, "unsigned":$scopeLine, - "DISubprogramFlags":$subprogramFlags, "DISubroutineTypeAttr":$type + "DISubprogramFlags":$subprogramFlags, "DISubroutineTypeAttr":$type, + "ArrayRef":$retainedNodes ), [{ MLIRContext *ctx = file.getContext(); return $_get(ctx, id, compileUnit, scope, StringAttr::get(ctx, name), StringAttr::get(ctx, linkageName), file, line, - scopeLine, subprogramFlags, type); + scopeLine, subprogramFlags, type, retainedNodes); }]> ]; @@ -619,6 +621,29 @@ def LLVM_DINamespaceAttr : LLVM_Attr<"DINamespace", "di_namespace", let assemblyFormat = "`<` struct(params) `>`"; } +//===----------------------------------------------------------------------===// +// DIImportedEntityAttr +//===----------------------------------------------------------------------===// + +def LLVM_DIImportedEntityAttr : LLVM_Attr<"DIImportedEntity", "di_imported_entity", + /*traits=*/[], "DINodeAttr"> { + /// TODO: DIImportedEntity has a 'scope' field which represents the scope where + /// this entity is imported. Currently, we are not adding a 'scope' field in + /// DIImportedEntityAttr to avoid cyclic dependency. As DIImportedEntityAttr + /// entries will be contained inside a scope entity (e.g. DISubprogramAttr), + /// the scope can easily be inferred. + let parameters = (ins + LLVM_DITagParameter:$tag, + "DINodeAttr":$entity, + OptionalParameter<"DIFileAttr">:$file, + OptionalParameter<"unsigned">:$line, + OptionalParameter<"StringAttr">:$name, + OptionalArrayRefParameter<"DINodeAttr">:$elements + ); + + let assemblyFormat = "`<` struct(params) `>`"; +} + //===----------------------------------------------------------------------===// // DISubrangeAttr //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index a1e6fc3e29900..e832dfa9d6b80 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -166,7 +166,7 @@ def ROCDL_BallotOp : let summary = "Vote across thread group"; let description = [{ - Ballot provides a bit mask containing the 1-bit predicate value from each lane. + Ballot provides a bit mask containing the 1-bit predicate value from each lane. The nth bit of the result contains the 1 bit contributed by the nth warp lane. 
}]; @@ -579,6 +579,21 @@ def ROCDL_DPPUpdateOp : ROCDL_IntrOp<"update.dpp", [], [0], }]; } +//===---------------------------------------------------------------------===// +// 16-bit float intrinsics +//===---------------------------------------------------------------------===// +def ROCDL_CvtPkRtz: + ROCDL_IntrOp<"cvt.pkrtz", [], [], [Pure], 1>, + Arguments<(ins F32:$srcA, F32:$srcB)> { + let summary = "Convert two f32 input into a vector<2xf16>"; + let description = [{ + Convert two f32 values into a packed vector<2xf16>. + }]; + let assemblyFormat = [{ + attr-dict $srcA `,` $srcB `:` type($res) + }]; +} + //===---------------------------------------------------------------------===// // 8-bit float intrinsics //===---------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.h b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.h index 08afdf373f014..0fcaa96ade403 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.h +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.h @@ -110,8 +110,12 @@ struct ConvolutionDimensions { FailureOr inferConvolutionDims(LinalgOp linalgOp); /// Checks whether `linalgOp` conforms to ConvolutionOpInterface. +/// By default, we require the `linalgOp` to have non-empty convolved dims +/// (implicitly non-empty `output_image` and `filter_loop`). +/// Users can loosen the constraint by setting `allowEmptyConvolvedDims` to true // TODO: embed within `isa` if possible / natural. -bool isaConvolutionOpInterface(LinalgOp linalgOp); +bool isaConvolutionOpInterface(LinalgOp linalgOp, + bool allowEmptyConvolvedDims = false); /// Checks whether `linalgOp` is semantically equivalent to a `linalg.copyOp`. bool isaCopyOpInterface(LinalgOp linalgOp); @@ -175,9 +179,12 @@ enum class MatchConvolutionResult; /// Checks whether `op` conforms to ConvolutionOpInterface and populates /// `dimensions` with indexes of the different kinds of dimensions when /// present. +/// If `allowEmptyConvolvedDims` is not set, we further checks whether the `op` +/// contains convolved dims. MatchConvolutionResult isConvolutionInterfaceImpl(Operation *op, - ConvolutionDimensions *dimensions = nullptr); + ConvolutionDimensions *dimensions = nullptr, + bool allowEmptyConvolvedDims = false); /// Returns the error message corresponding to the convolution checking return /// code. diff --git a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h index 90021ffa7c380..efbe5c56a219b 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h +++ b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h @@ -162,7 +162,7 @@ struct SparsifierOptions : public PassPipelineOptions { } /// Projects out the options for `createConvertVectorToLLVMPass`. 
- ConvertVectorToLLVMPassOptions lowerVectorToLLVMOptions() const { + ConvertVectorToLLVMPassOptions convertVectorToLLVMOptions() const { ConvertVectorToLLVMPassOptions opts{}; opts.reassociateFPReductions = reassociateFPReductions; opts.force32BitVectorIndices = force32BitVectorIndices; diff --git a/mlir/include/mlir/Support/ThreadLocalCache.h b/mlir/include/mlir/Support/ThreadLocalCache.h index 87cc52cc56ac4..53b6d31a09555 100644 --- a/mlir/include/mlir/Support/ThreadLocalCache.h +++ b/mlir/include/mlir/Support/ThreadLocalCache.h @@ -27,6 +27,8 @@ template class ThreadLocalCache { struct PerInstanceState; + using PointerAndFlag = std::pair>; + /// The "observer" is owned by a thread-local cache instance. It is /// constructed the first time a `ThreadLocalCache` instance is accessed by a /// thread, unless `perInstanceState` happens to get re-allocated to the same @@ -41,7 +43,8 @@ class ThreadLocalCache { /// This is the double pointer, explicitly allocated because we need to keep /// the address stable if the TLC map re-allocates. It is owned by the /// observer and shared with the value owner. - std::shared_ptr ptr = std::make_shared(nullptr); + std::shared_ptr ptr = + std::make_shared(std::make_pair(nullptr, false)); /// Because the `Owner` instance that lives inside `PerInstanceState` /// contains a reference to the double pointer, and likewise this class /// contains a reference to the value, we need to synchronize destruction of @@ -62,18 +65,21 @@ class ThreadLocalCache { /// Save a pointer to the reference and write it to the newly created entry. Owner(Observer &observer) : value(std::make_unique()), ptrRef(observer.ptr) { - *observer.ptr = value.get(); + observer.ptr->second = true; + observer.ptr->first = value.get(); } ~Owner() { - if (std::shared_ptr ptr = ptrRef.lock()) - *ptr = nullptr; + if (std::shared_ptr ptr = ptrRef.lock()) { + ptr->first = nullptr; + ptr->second = false; + } } Owner(Owner &&) = default; Owner &operator=(Owner &&) = default; std::unique_ptr value; - std::weak_ptr ptrRef; + std::weak_ptr ptrRef; }; // Keep a separate shared_ptr protected state that can be acquired atomically @@ -116,7 +122,7 @@ class ThreadLocalCache { // back to the data here that is being destroyed. for (auto &[instance, observer] : *this) if (std::shared_ptr state = observer.keepalive.lock()) - state->remove(*observer.ptr); + state->remove(observer.ptr->first); } /// Clear out any unused entries within the map. This method is not @@ -124,7 +130,7 @@ class ThreadLocalCache { void clearExpiredEntries() { for (auto it = this->begin(), e = this->end(); it != e;) { auto curIt = it++; - if (!*curIt->second.ptr) + if (!curIt->second.ptr->second) this->erase(curIt); } } @@ -142,7 +148,7 @@ class ThreadLocalCache { // Check for an already existing instance for this thread. CacheType &staticCache = getStaticCache(); Observer &threadInstance = staticCache[perInstanceState.get()]; - if (ValueT *value = *threadInstance.ptr) + if (ValueT *value = threadInstance.ptr->first) return *value; // Otherwise, create a new instance for this thread. @@ -157,7 +163,7 @@ class ThreadLocalCache { // entries in the static map. The cache is only cleared within the same // thread to remove the need to lock the cache itself. 
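    // With the PointerAndFlag change above, the bool flag records whether the
    // entry's Owner is still alive, independently of the value pointer, which
    // is what clearExpiredEntries() now keys off.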
staticCache.clearExpiredEntries(); - return **threadInstance.ptr; + return *threadInstance.ptr->first; } ValueT &operator*() { return get(); } ValueT *operator->() { return &get(); } diff --git a/mlir/lib/Bindings/Python/IRTypes.cpp b/mlir/lib/Bindings/Python/IRTypes.cpp index c3d42c0ef8e3c..2ee1d89c38884 100644 --- a/mlir/lib/Bindings/Python/IRTypes.cpp +++ b/mlir/lib/Bindings/Python/IRTypes.cpp @@ -10,6 +10,8 @@ #include "PybindUtils.h" +#include "mlir/Bindings/Python/IRTypes.h" + #include "mlir-c/BuiltinAttributes.h" #include "mlir-c/BuiltinTypes.h" #include "mlir-c/Support.h" @@ -418,98 +420,98 @@ class PyComplexType : public PyConcreteType { } }; -class PyShapedType : public PyConcreteType { -public: - static constexpr IsAFunctionTy isaFunction = mlirTypeIsAShaped; - static constexpr const char *pyClassName = "ShapedType"; - using PyConcreteType::PyConcreteType; +} // namespace - static void bindDerived(ClassTy &c) { - c.def_property_readonly( - "element_type", - [](PyShapedType &self) { return mlirShapedTypeGetElementType(self); }, - "Returns the element type of the shaped type."); - c.def_property_readonly( - "has_rank", - [](PyShapedType &self) -> bool { return mlirShapedTypeHasRank(self); }, - "Returns whether the given shaped type is ranked."); - c.def_property_readonly( - "rank", - [](PyShapedType &self) { - self.requireHasRank(); - return mlirShapedTypeGetRank(self); - }, - "Returns the rank of the given ranked shaped type."); - c.def_property_readonly( - "has_static_shape", - [](PyShapedType &self) -> bool { - return mlirShapedTypeHasStaticShape(self); - }, - "Returns whether the given shaped type has a static shape."); - c.def( - "is_dynamic_dim", - [](PyShapedType &self, intptr_t dim) -> bool { - self.requireHasRank(); - return mlirShapedTypeIsDynamicDim(self, dim); - }, - py::arg("dim"), - "Returns whether the dim-th dimension of the given shaped type is " - "dynamic."); - c.def( - "get_dim_size", - [](PyShapedType &self, intptr_t dim) { - self.requireHasRank(); - return mlirShapedTypeGetDimSize(self, dim); - }, - py::arg("dim"), - "Returns the dim-th dimension of the given ranked shaped type."); - c.def_static( - "is_dynamic_size", - [](int64_t size) -> bool { return mlirShapedTypeIsDynamicSize(size); }, - py::arg("dim_size"), - "Returns whether the given dimension size indicates a dynamic " - "dimension."); - c.def( - "is_dynamic_stride_or_offset", - [](PyShapedType &self, int64_t val) -> bool { - self.requireHasRank(); - return mlirShapedTypeIsDynamicStrideOrOffset(val); - }, - py::arg("dim_size"), - "Returns whether the given value is used as a placeholder for dynamic " - "strides and offsets in shaped types."); - c.def_property_readonly( - "shape", - [](PyShapedType &self) { - self.requireHasRank(); - - std::vector shape; - int64_t rank = mlirShapedTypeGetRank(self); - shape.reserve(rank); - for (int64_t i = 0; i < rank; ++i) - shape.push_back(mlirShapedTypeGetDimSize(self, i)); - return shape; - }, - "Returns the shape of the ranked shaped type as a list of integers."); - c.def_static( - "get_dynamic_size", []() { return mlirShapedTypeGetDynamicSize(); }, - "Returns the value used to indicate dynamic dimensions in shaped " - "types."); - c.def_static( - "get_dynamic_stride_or_offset", - []() { return mlirShapedTypeGetDynamicStrideOrOffset(); }, - "Returns the value used to indicate dynamic strides or offsets in " - "shaped types."); - } +// Shaped Type Interface - ShapedType +void mlir::PyShapedType::bindDerived(ClassTy &c) { + c.def_property_readonly( + 
"element_type", + [](PyShapedType &self) { return mlirShapedTypeGetElementType(self); }, + "Returns the element type of the shaped type."); + c.def_property_readonly( + "has_rank", + [](PyShapedType &self) -> bool { return mlirShapedTypeHasRank(self); }, + "Returns whether the given shaped type is ranked."); + c.def_property_readonly( + "rank", + [](PyShapedType &self) { + self.requireHasRank(); + return mlirShapedTypeGetRank(self); + }, + "Returns the rank of the given ranked shaped type."); + c.def_property_readonly( + "has_static_shape", + [](PyShapedType &self) -> bool { + return mlirShapedTypeHasStaticShape(self); + }, + "Returns whether the given shaped type has a static shape."); + c.def( + "is_dynamic_dim", + [](PyShapedType &self, intptr_t dim) -> bool { + self.requireHasRank(); + return mlirShapedTypeIsDynamicDim(self, dim); + }, + py::arg("dim"), + "Returns whether the dim-th dimension of the given shaped type is " + "dynamic."); + c.def( + "get_dim_size", + [](PyShapedType &self, intptr_t dim) { + self.requireHasRank(); + return mlirShapedTypeGetDimSize(self, dim); + }, + py::arg("dim"), + "Returns the dim-th dimension of the given ranked shaped type."); + c.def_static( + "is_dynamic_size", + [](int64_t size) -> bool { return mlirShapedTypeIsDynamicSize(size); }, + py::arg("dim_size"), + "Returns whether the given dimension size indicates a dynamic " + "dimension."); + c.def( + "is_dynamic_stride_or_offset", + [](PyShapedType &self, int64_t val) -> bool { + self.requireHasRank(); + return mlirShapedTypeIsDynamicStrideOrOffset(val); + }, + py::arg("dim_size"), + "Returns whether the given value is used as a placeholder for dynamic " + "strides and offsets in shaped types."); + c.def_property_readonly( + "shape", + [](PyShapedType &self) { + self.requireHasRank(); + + std::vector shape; + int64_t rank = mlirShapedTypeGetRank(self); + shape.reserve(rank); + for (int64_t i = 0; i < rank; ++i) + shape.push_back(mlirShapedTypeGetDimSize(self, i)); + return shape; + }, + "Returns the shape of the ranked shaped type as a list of integers."); + c.def_static( + "get_dynamic_size", []() { return mlirShapedTypeGetDynamicSize(); }, + "Returns the value used to indicate dynamic dimensions in shaped " + "types."); + c.def_static( + "get_dynamic_stride_or_offset", + []() { return mlirShapedTypeGetDynamicStrideOrOffset(); }, + "Returns the value used to indicate dynamic strides or offsets in " + "shaped types."); +} -private: - void requireHasRank() { - if (!mlirShapedTypeHasRank(*this)) { - throw py::value_error( - "calling this method requires that the type has a rank."); - } +void mlir::PyShapedType::requireHasRank() { + if (!mlirShapedTypeHasRank(*this)) { + throw py::value_error( + "calling this method requires that the type has a rank."); } -}; +} + +const mlir::PyShapedType::IsAFunctionTy mlir::PyShapedType::isaFunction = + mlirTypeIsAShaped; + +namespace { /// Vector Type subclass - VectorType. 
class PyVectorType : public PyConcreteType { diff --git a/mlir/lib/CAPI/Dialect/LLVM.cpp b/mlir/lib/CAPI/Dialect/LLVM.cpp index 03e2f2be2156a..13341f0c4de88 100644 --- a/mlir/lib/CAPI/Dialect/LLVM.cpp +++ b/mlir/lib/CAPI/Dialect/LLVM.cpp @@ -293,14 +293,20 @@ MlirAttribute mlirLLVMDISubprogramAttrGet( MlirContext ctx, MlirAttribute id, MlirAttribute compileUnit, MlirAttribute scope, MlirAttribute name, MlirAttribute linkageName, MlirAttribute file, unsigned int line, unsigned int scopeLine, - uint64_t subprogramFlags, MlirAttribute type) { + uint64_t subprogramFlags, MlirAttribute type, intptr_t nRetainedNodes, + MlirAttribute const *retainedNodes) { + SmallVector nodesStorage; + nodesStorage.reserve(nRetainedNodes); return wrap(DISubprogramAttr::get( unwrap(ctx), cast(unwrap(id)), cast(unwrap(compileUnit)), cast(unwrap(scope)), cast(unwrap(name)), cast(unwrap(linkageName)), cast(unwrap(file)), line, scopeLine, DISubprogramFlags(subprogramFlags), - cast(unwrap(type)))); + cast(unwrap(type)), + llvm::map_to_vector( + unwrapList(nRetainedNodes, retainedNodes, nodesStorage), + [](Attribute a) { return cast(a); }))); } MlirAttribute mlirLLVMDISubprogramAttrGetScope(MlirAttribute diSubprogram) { @@ -345,3 +351,16 @@ MlirAttribute mlirLLVMDIModuleAttrGet(MlirContext ctx, MlirAttribute file, MlirAttribute mlirLLVMDIModuleAttrGetScope(MlirAttribute diModule) { return wrap(cast(unwrap(diModule)).getScope()); } + +MlirAttribute mlirLLVMDIImportedEntityAttrGet( + MlirContext ctx, unsigned int tag, MlirAttribute entity, MlirAttribute file, + unsigned int line, MlirAttribute name, intptr_t nElements, + MlirAttribute const *elements) { + SmallVector elementsStorage; + elementsStorage.reserve(nElements); + return wrap(DIImportedEntityAttr::get( + unwrap(ctx), tag, cast(unwrap(entity)), + cast(unwrap(file)), line, cast(unwrap(name)), + llvm::map_to_vector(unwrapList(nElements, elements, elementsStorage), + [](Attribute a) { return cast(a); }))); +} diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp index b3798a3f7624b..d36583c8118ff 100644 --- a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp +++ b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp @@ -9,8 +9,11 @@ #include "mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h" #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h" +#include "mlir/Dialect/AMDGPU/Utils/Chipset.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Arith/Utils/Utils.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/LLVMIR/ROCDLDialect.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/PatternMatch.h" @@ -24,6 +27,7 @@ namespace mlir { } // namespace mlir using namespace mlir; +using namespace mlir::amdgpu; namespace { struct ArithToAMDGPUConversionPass final @@ -43,12 +47,25 @@ struct ExtFOnFloat8RewritePattern final : OpRewritePattern { struct TruncFToFloat8RewritePattern final : OpRewritePattern { bool saturateFP8 = false; - TruncFToFloat8RewritePattern(MLIRContext *ctx, bool saturateFP8) - : OpRewritePattern::OpRewritePattern(ctx), saturateFP8(saturateFP8) {} + TruncFToFloat8RewritePattern(MLIRContext *ctx, bool saturateFP8, + Chipset chipset) + : OpRewritePattern::OpRewritePattern(ctx), saturateFP8(saturateFP8), + chipset(chipset) {} + Chipset chipset; LogicalResult match(arith::TruncFOp op) const override; void rewrite(arith::TruncFOp op, PatternRewriter &rewriter) const override; }; + +struct TruncfToFloat16RewritePattern 
final + : public OpRewritePattern { + + using OpRewritePattern::OpRewritePattern; + + LogicalResult match(arith::TruncFOp op) const override; + void rewrite(arith::TruncFOp op, PatternRewriter &rewriter) const override; +}; + } // end namespace static Value castF32To(Type elementType, Value f32, Location loc, @@ -272,17 +289,105 @@ void TruncFToFloat8RewritePattern::rewrite(arith::TruncFOp op, rewriter.replaceOp(op, result); } +LogicalResult TruncfToFloat16RewritePattern::match(arith::TruncFOp op) const { + Type outType = op.getOut().getType(); + Type inputType = getElementTypeOrSelf(op.getIn()); + if (auto outVecType = dyn_cast(outType)) { + if (outVecType.isScalable()) + return failure(); + outType = outVecType.getElementType(); + } + return success(outType.isF16() && inputType.isF32()); +} + +void TruncfToFloat16RewritePattern::rewrite(arith::TruncFOp op, + PatternRewriter &rewriter) const { + Location loc = op.getLoc(); + Value in = op.getIn(); + Type outElemType = getElementTypeOrSelf(op.getOut().getType()); + VectorType truncResType = VectorType::get(2, outElemType); + auto inVectorTy = dyn_cast(in.getType()); + + // Handle the case where input type is not a vector type + if (!inVectorTy) { + auto sourceB = rewriter.create(loc, rewriter.getF32Type()); + Value asF16s = + rewriter.create(loc, truncResType, in, sourceB); + Value result = rewriter.create( + loc, asF16s, rewriter.createOrFold(loc, 0)); + return rewriter.replaceOp(op, result); + } + VectorType outType = cast(op.getOut().getType()); + int64_t numElements = outType.getNumElements(); + Value zero = rewriter.createOrFold( + loc, outElemType, rewriter.getFloatAttr(outElemType, 0.0)); + Value result = rewriter.createOrFold(loc, outType, zero); + + if (inVectorTy.getRank() > 1) { + inVectorTy = VectorType::get(SmallVector{numElements}, + inVectorTy.getElementType()); + in = rewriter.create(loc, inVectorTy, in); + } + + // Handle the vector case. We also handle the (uncommon) case where the vector + // length is odd + for (int64_t i = 0; i < numElements; i += 2) { + int64_t elemsThisOp = std::min(numElements, i + 2) - i; + Value thisResult = nullptr; + Value elemA = rewriter.create( + loc, in, rewriter.create(loc, i)); + Value elemB = rewriter.create(loc, rewriter.getF32Type()); + + if (elemsThisOp == 2) { + elemB = rewriter.create( + loc, in, rewriter.createOrFold(loc, i + 1)); + } + + thisResult = + rewriter.create(loc, truncResType, elemA, elemB); + // Place back the truncated result into the possibly larger vector. 
If we + // are operating on a size 2 vector, these operations should be folded away + thisResult = rewriter.create( + loc, thisResult, 0, elemsThisOp, 1); + result = rewriter.create(loc, thisResult, + result, i, 1); + } + + if (inVectorTy.getRank() != outType.getRank()) { + result = rewriter.create(loc, outType, result); + } + + rewriter.replaceOp(op, result); +} + void mlir::arith::populateArithToAMDGPUConversionPatterns( - RewritePatternSet &patterns, bool saturateFP8TruncF) { - patterns.add(patterns.getContext()); - patterns.add(patterns.getContext(), - saturateFP8TruncF); + RewritePatternSet &patterns, bool convertFP8Arithmetic, + bool saturateFP8Truncf, bool allowPackedF16Rtz, Chipset chipset) { + + if (convertFP8Arithmetic) { + patterns.add(patterns.getContext()); + patterns.add(patterns.getContext(), + saturateFP8Truncf, chipset); + } + if (allowPackedF16Rtz) + patterns.add(patterns.getContext()); } void ArithToAMDGPUConversionPass::runOnOperation() { Operation *op = getOperation(); + MLIRContext *ctx = &getContext(); RewritePatternSet patterns(op->getContext()); - arith::populateArithToAMDGPUConversionPatterns(patterns, saturateFP8Truncf); + FailureOr maybeChipset = amdgpu::Chipset::parse(chipset); + if (failed(maybeChipset)) { + emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset); + return signalPassFailure(); + } + + bool convertFP8Arithmetic = + (*maybeChipset).majorVersion == 9 && (*maybeChipset).minorVersion >= 0x40; + arith::populateArithToAMDGPUConversionPatterns( + patterns, convertFP8Arithmetic, saturateFP8Truncf, allowPackedF16Rtz, + *maybeChipset); if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns)))) return signalPassFailure(); } diff --git a/mlir/lib/Conversion/ArithToAMDGPU/CMakeLists.txt b/mlir/lib/Conversion/ArithToAMDGPU/CMakeLists.txt index e2c951b0b34d8..50be09ab5a7c5 100644 --- a/mlir/lib/Conversion/ArithToAMDGPU/CMakeLists.txt +++ b/mlir/lib/Conversion/ArithToAMDGPU/CMakeLists.txt @@ -12,6 +12,7 @@ add_mlir_conversion_library(MLIRArithToAMDGPU LINK_LIBS PUBLIC MLIRAMDGPUDialect + MLIRAMDGPUUtils MLIRArithDialect MLIRArithUtils MLIRVectorDialect diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp index 55143d5939ba2..842d239cf6a51 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp @@ -35,8 +35,8 @@ using namespace mlir; using namespace mlir::vector; namespace { -struct LowerVectorToLLVMPass - : public impl::ConvertVectorToLLVMPassBase { +struct ConvertVectorToLLVMPass + : public impl::ConvertVectorToLLVMPassBase { using Base::Base; @@ -58,7 +58,7 @@ struct LowerVectorToLLVMPass }; } // namespace -void LowerVectorToLLVMPass::runOnOperation() { +void ConvertVectorToLLVMPass::runOnOperation() { // Perform progressive lowering of operations on slices and // all contraction operations. Also applies folding and DCE. 
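  // (The pass implementation was renamed from LowerVectorToLLVMPass to
  // ConvertVectorToLLVMPass, presumably to match the generated
  // ConvertVectorToLLVMPassBase / registered pass name; the lowering logic
  // itself is unchanged.)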
{ diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index c1a785fb25478..3943696364950 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -14,6 +14,7 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/LLVMIR/ROCDLDialect.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/Diagnostics.h" diff --git a/mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt index 0551d13b5a0cf..78d78cf48a747 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt +++ b/mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt @@ -11,6 +11,7 @@ add_mlir_dialect_library(MLIRAMDGPUDialect LINK_LIBS PUBLIC MLIRArithDialect + MLIRROCDLDialect # Needed for GPU address space enum definition MLIRGPUDialect MLIRIR diff --git a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp index 053ea7935260a..9fbe574ec392d 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp @@ -258,20 +258,23 @@ struct CallOpInterface return failure(); Value buffer = *maybeBuffer; - // Caller / callee type mismatch is handled with a CastOp. + // Caller / callee type mismatch is handled with castOrReallocMemRefValue. auto memRefType = funcType.getInput(opOperand.getOperandNumber()); // Since we don't yet have a clear layout story, to_memref may // conservatively turn tensors into more dynamic memref than necessary. // If the memref type of the callee fails, introduce an extra memref.cast // that will either canonicalize away or fail compilation until we can do - // something better. + // something better. Insert a reallocation + copy if it cannot be + // statically guaranteed that a direct cast would be valid. if (buffer.getType() != memRefType) { - assert( - memref::CastOp::areCastCompatible(buffer.getType(), memRefType) && - "CallOp::bufferize: cast incompatible"); - Value castBuffer = rewriter.create(callOp.getLoc(), - memRefType, buffer); - buffer = castBuffer; + auto memrefDstType = dyn_cast(memRefType); + assert(memrefDstType && + "buffer layout not supported on unranked tensors"); + FailureOr replacement = bufferization::castOrReallocMemRefValue( + rewriter, buffer, memrefDstType, options); + if (failed(replacement)) + return failure(); + buffer = *replacement; } newOperands.push_back(buffer); } diff --git a/mlir/lib/Dialect/DLTI/DLTI.cpp b/mlir/lib/Dialect/DLTI/DLTI.cpp index 7f8e11a1b7334..85ec9fc93248a 100644 --- a/mlir/lib/Dialect/DLTI/DLTI.cpp +++ b/mlir/lib/Dialect/DLTI/DLTI.cpp @@ -424,8 +424,16 @@ getClosestQueryable(Operation *op) { return std::pair(queryable, op); } -FailureOr dlti::query(Operation *op, ArrayRef keys, - bool emitError) { +FailureOr +dlti::query(Operation *op, ArrayRef keys, bool emitError) { + if (keys.empty()) { + if (emitError) { + auto diag = op->emitError() << "target op of failed DLTI query"; + diag.attachNote(op->getLoc()) << "no keys provided to attempt query with"; + } + return failure(); + } + auto [queryable, queryOp] = getClosestQueryable(op); Operation *reportOp = (queryOp ? 
queryOp : op); @@ -438,6 +446,15 @@ FailureOr dlti::query(Operation *op, ArrayRef keys, return failure(); } + auto keyToStr = [](DataLayoutEntryKey key) -> std::string { + std::string buf; + llvm::TypeSwitch(key) + .Case( // The only two kinds of key we know of. + [&](auto key) { llvm::raw_string_ostream(buf) << key; }) + .Default([](auto) { llvm_unreachable("unexpected entry key kind"); }); + return buf; + }; + Attribute currentAttr = queryable; for (auto &&[idx, key] : llvm::enumerate(keys)) { if (auto map = llvm::dyn_cast(currentAttr)) { @@ -446,17 +463,24 @@ FailureOr dlti::query(Operation *op, ArrayRef keys, if (emitError) { auto diag = op->emitError() << "target op of failed DLTI query"; diag.attachNote(reportOp->getLoc()) - << "key " << key << " has no DLTI-mapping per attr: " << map; + << "key " << keyToStr(key) + << " has no DLTI-mapping per attr: " << map; } return failure(); } currentAttr = *maybeAttr; } else { if (emitError) { + std::string commaSeparatedKeys; + llvm::interleave( + keys.take_front(idx), // All prior keys. + [&](auto key) { commaSeparatedKeys += keyToStr(key); }, + [&]() { commaSeparatedKeys += ","; }); + auto diag = op->emitError() << "target op of failed DLTI query"; diag.attachNote(reportOp->getLoc()) << "got non-DLTI-queryable attribute upon looking up keys [" - << keys.take_front(idx) << "] at op"; + << commaSeparatedKeys << "] at op"; } return failure(); } diff --git a/mlir/lib/Dialect/DLTI/TransformOps/DLTITransformOps.cpp b/mlir/lib/Dialect/DLTI/TransformOps/DLTITransformOps.cpp index 90aef82bddff0..02c41b4fe8113 100644 --- a/mlir/lib/Dialect/DLTI/TransformOps/DLTITransformOps.cpp +++ b/mlir/lib/Dialect/DLTI/TransformOps/DLTITransformOps.cpp @@ -33,7 +33,16 @@ void transform::QueryOp::getEffects( DiagnosedSilenceableFailure transform::QueryOp::applyToOne( transform::TransformRewriter &rewriter, Operation *target, transform::ApplyToEachResultList &results, TransformState &state) { - auto keys = SmallVector(getKeys().getAsRange()); + SmallVector keys; + for (Attribute key : getKeys()) { + if (auto strKey = dyn_cast(key)) + keys.push_back(strKey); + else if (auto typeKey = dyn_cast(key)) + keys.push_back(typeKey.getValue()); + else + return emitDefiniteFailure("'transform.dlti.query' keys of wrong type: " + "only StringAttr and TypeAttr are allowed"); + } FailureOr result = dlti::query(target, keys, /*emitError=*/true); diff --git a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp index e6f1618cc2611..fdc21d6c6e24b 100644 --- a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp +++ b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp @@ -116,6 +116,11 @@ bool mlir::emitc::isIntegerIndexOrOpaqueType(Type type) { bool mlir::emitc::isSupportedFloatType(Type type) { if (auto floatType = llvm::dyn_cast(type)) { switch (floatType.getWidth()) { + case 16: { + if (llvm::isa(type)) + return true; + return false; + } case 32: case 64: return true; diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp index 963a4be25079e..98a9659735e7e 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp @@ -58,10 +58,11 @@ void LLVMDialect::registerAttributes() { bool DINodeAttr::classof(Attribute attr) { return llvm::isa(attr); + DIImportedEntityAttr, DILabelAttr, DILexicalBlockAttr, + DILexicalBlockFileAttr, DILocalVariableAttr, DIModuleAttr, + DINamespaceAttr, DINullTypeAttr, DIStringTypeAttr, + DISubprogramAttr, DISubrangeAttr, DISubroutineTypeAttr>( + attr); } 
//===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/DIScopeForLLVMFuncOp.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/DIScopeForLLVMFuncOp.cpp index 395ff6ed1e48e..758700c9272bc 100644 --- a/mlir/lib/Dialect/LLVMIR/Transforms/DIScopeForLLVMFuncOp.cpp +++ b/mlir/lib/Dialect/LLVMIR/Transforms/DIScopeForLLVMFuncOp.cpp @@ -79,7 +79,8 @@ static void addScopeToFunction(LLVM::LLVMFuncOp llvmFunc, context, id, compileUnitAttr, fileAttr, funcNameAttr, funcNameAttr, fileAttr, /*line=*/line, - /*scopeline=*/col, subprogramFlags, subroutineTypeAttr); + /*scopeline=*/col, subprogramFlags, subroutineTypeAttr, + /*retainedNodes=*/{}); llvmFunc->setLoc(FusedLoc::get(context, {loc}, subprogramAttr)); } diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp index 6ee1810c2ff2b..d5c21fb5d845e 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp @@ -762,13 +762,15 @@ enum class MatchConvolutionResult { NotProjectedPermutations, NonConvolutionLoop, OutputDimsNotParallel, - NonOutputDimNotReduction + NonOutputDimNotReduction, + EmptyConvolvedDims }; } // namespace mlir::linalg::detail mlir::linalg::detail::MatchConvolutionResult mlir::linalg::detail::isConvolutionInterfaceImpl( - Operation *op, ConvolutionDimensions *dimensions) { + Operation *op, ConvolutionDimensions *dimensions, + bool allowEmptyConvolvedDims) { auto linalgOp = dyn_cast(op); if (!linalgOp) return MatchConvolutionResult::NotLinalgOp; @@ -886,10 +888,12 @@ mlir::linalg::detail::isConvolutionInterfaceImpl( if (allLoopDims.size() != linalgOp.getNumLoops()) return MatchConvolutionResult::NonConvolutionLoop; + if (!allowEmptyConvolvedDims && inputExprWalker.convolvedDims.empty()) + return MatchConvolutionResult::EmptyConvolvedDims; + if (dimensions) { - FailureOr res = - inferConvolutionDimsImpl(linalgOp, inputExprWalker, - /*allowEmptyConvolvedDims=*/true); + FailureOr res = inferConvolutionDimsImpl( + linalgOp, inputExprWalker, allowEmptyConvolvedDims); assert(succeeded(res) && "unexpected failure to infer convolution dims"); *dimensions = *res; } @@ -914,14 +918,18 @@ mlir::linalg::detail::getMatchConvolutionMessage(MatchConvolutionResult res) { return "expected all iterators used to access outputs to be parallel"; case MatchConvolutionResult::NonOutputDimNotReduction: return "expected all iterators not used to access outputs to be reduction"; + case MatchConvolutionResult::EmptyConvolvedDims: + return "FIXME"; case MatchConvolutionResult::Success: return ""; } llvm_unreachable("unhandled MatchConvolutionResult case"); } -bool mlir::linalg::isaConvolutionOpInterface(LinalgOp linalgOp) { - return linalg::detail::isConvolutionInterfaceImpl(linalgOp.getOperation()) == +bool mlir::linalg::isaConvolutionOpInterface(LinalgOp linalgOp, + bool allowEmptyConvolvedDims) { + return linalg::detail::isConvolutionInterfaceImpl( + linalgOp.getOperation(), nullptr, allowEmptyConvolvedDims) == linalg::detail::MatchConvolutionResult::Success; } diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp index c5eb965884396..12e330ac7efbd 100644 --- a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp +++ b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp @@ -76,16 +76,19 @@ void mlir::sparse_tensor::buildSparsifier(OpPassManager &pm, 
pm.addNestedPass(createConvertSCFToCFPass()); pm.addPass(memref::createExpandStridedMetadataPass()); pm.addPass(createLowerAffinePass()); - pm.addPass(createConvertVectorToLLVMPass(options.lowerVectorToLLVMOptions())); + pm.addPass( + createConvertVectorToLLVMPass(options.convertVectorToLLVMOptions())); pm.addPass(createFinalizeMemRefToLLVMConversionPass()); pm.addNestedPass(createConvertComplexToStandardPass()); pm.addNestedPass(arith::createArithExpandOpsPass()); pm.addNestedPass(createConvertMathToLLVMPass()); pm.addPass(createConvertMathToLibmPass()); pm.addPass(createConvertComplexToLibmPass()); - pm.addPass(createConvertVectorToLLVMPass(options.lowerVectorToLLVMOptions())); + pm.addPass( + createConvertVectorToLLVMPass(options.convertVectorToLLVMOptions())); pm.addPass(createConvertComplexToLLVMPass()); - pm.addPass(createConvertVectorToLLVMPass(options.lowerVectorToLLVMOptions())); + pm.addPass( + createConvertVectorToLLVMPass(options.convertVectorToLLVMOptions())); pm.addPass(createConvertFuncToLLVMPass()); // Finalize GPU code generation. diff --git a/mlir/lib/Target/Cpp/TranslateToCpp.cpp b/mlir/lib/Target/Cpp/TranslateToCpp.cpp index c043582b7be9c..30657d8fccb15 100644 --- a/mlir/lib/Target/Cpp/TranslateToCpp.cpp +++ b/mlir/lib/Target/Cpp/TranslateToCpp.cpp @@ -1258,6 +1258,12 @@ LogicalResult CppEmitter::emitAttribute(Location loc, Attribute attr) { val.toString(strValue, 0, 0, false); os << strValue; switch (llvm::APFloatBase::SemanticsToEnum(val.getSemantics())) { + case llvm::APFloatBase::S_IEEEhalf: + os << "f16"; + break; + case llvm::APFloatBase::S_BFloat: + os << "bf16"; + break; case llvm::APFloatBase::S_IEEEsingle: os << "f"; break; @@ -1277,17 +1283,19 @@ LogicalResult CppEmitter::emitAttribute(Location loc, Attribute attr) { // Print floating point attributes. 
if (auto fAttr = dyn_cast(attr)) { - if (!isa(fAttr.getType())) { - return emitError(loc, - "expected floating point attribute to be f32 or f64"); + if (!isa( + fAttr.getType())) { + return emitError( + loc, "expected floating point attribute to be f16, bf16, f32 or f64"); } printFloat(fAttr.getValue()); return success(); } if (auto dense = dyn_cast(attr)) { - if (!isa(dense.getElementType())) { - return emitError(loc, - "expected floating point attribute to be f32 or f64"); + if (!isa( + dense.getElementType())) { + return emitError( + loc, "expected floating point attribute to be f16, bf16, f32 or f64"); } os << '{'; interleaveComma(dense, os, [&](const APFloat &val) { printFloat(val); }); @@ -1640,6 +1648,14 @@ LogicalResult CppEmitter::emitType(Location loc, Type type) { } if (auto fType = dyn_cast(type)) { switch (fType.getWidth()) { + case 16: { + if (llvm::isa(type)) + return (os << "_Float16"), success(); + else if (llvm::isa(type)) + return (os << "__bf16"), success(); + else + return emitError(loc, "cannot emit float type ") << type; + } case 32: return (os << "float"), success(); case 64: diff --git a/mlir/lib/Target/LLVMIR/DebugImporter.cpp b/mlir/lib/Target/LLVMIR/DebugImporter.cpp index 1817c1271b43e..ce3643f513d34 100644 --- a/mlir/lib/Target/LLVMIR/DebugImporter.cpp +++ b/mlir/lib/Target/LLVMIR/DebugImporter.cpp @@ -208,6 +208,20 @@ DINamespaceAttr DebugImporter::translateImpl(llvm::DINamespace *node) { node->getExportSymbols()); } +DIImportedEntityAttr +DebugImporter::translateImpl(llvm::DIImportedEntity *node) { + SmallVector elements; + for (llvm::DINode *element : node->getElements()) { + assert(element && "expected a non-null element type"); + elements.push_back(translate(element)); + } + + return DIImportedEntityAttr::get( + context, node->getTag(), translate(node->getEntity()), + translate(node->getFile()), node->getLine(), + getStringAttrOrNull(node->getRawName()), elements); +} + DISubprogramAttr DebugImporter::translateImpl(llvm::DISubprogram *node) { // Only definitions require a distinct identifier. 
mlir::DistinctAttr id; @@ -223,11 +237,17 @@ DISubprogramAttr DebugImporter::translateImpl(llvm::DISubprogram *node) { DISubroutineTypeAttr type = translate(node->getType()); if (node->getType() && !type) return nullptr; + + SmallVector retainedNodes; + for (llvm::DINode *retainedNode : node->getRetainedNodes()) + retainedNodes.push_back(translate(retainedNode)); + return DISubprogramAttr::get(context, id, translate(node->getUnit()), scope, getStringAttrOrNull(node->getRawName()), getStringAttrOrNull(node->getRawLinkageName()), translate(node->getFile()), node->getLine(), - node->getScopeLine(), *subprogramFlags, type); + node->getScopeLine(), *subprogramFlags, type, + retainedNodes); } DISubrangeAttr DebugImporter::translateImpl(llvm::DISubrange *node) { @@ -308,6 +328,8 @@ DINodeAttr DebugImporter::translate(llvm::DINode *node) { return translateImpl(casted); if (auto *casted = dyn_cast(node)) return translateImpl(casted); + if (auto *casted = dyn_cast(node)) + return translateImpl(casted); if (auto *casted = dyn_cast(node)) return translateImpl(casted); if (auto *casted = dyn_cast(node)) diff --git a/mlir/lib/Target/LLVMIR/DebugImporter.h b/mlir/lib/Target/LLVMIR/DebugImporter.h index 0e040891ba6c0..cb796676759c3 100644 --- a/mlir/lib/Target/LLVMIR/DebugImporter.h +++ b/mlir/lib/Target/LLVMIR/DebugImporter.h @@ -75,6 +75,7 @@ class DebugImporter { DIVariableAttr translateImpl(llvm::DIVariable *node); DIModuleAttr translateImpl(llvm::DIModule *node); DINamespaceAttr translateImpl(llvm::DINamespace *node); + DIImportedEntityAttr translateImpl(llvm::DIImportedEntity *node); DIScopeAttr translateImpl(llvm::DIScope *node); DISubprogramAttr translateImpl(llvm::DISubprogram *node); DISubrangeAttr translateImpl(llvm::DISubrange *node); diff --git a/mlir/lib/Target/LLVMIR/DebugTranslation.cpp b/mlir/lib/Target/LLVMIR/DebugTranslation.cpp index 95b37e47d0461..042e015f107fe 100644 --- a/mlir/lib/Target/LLVMIR/DebugTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/DebugTranslation.cpp @@ -306,6 +306,19 @@ llvm::DISubprogram *DebugTranslation::translateImpl(DISubprogramAttr attr) { static_cast(attr.getSubprogramFlags()), compileUnit); + // DIImportedEntity requires scope information which DIImportedEntityAttr does + // not have. This is why we translate DIImportedEntityAttr after we have + // created DISubprogram as we can use it as the scope. 
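The comment above captures the ordering constraint this hunk relies on: DIImportedEntityAttr deliberately carries no scope field, so the subprogram must be created first and is then passed back in as the scope when its retained nodes are translated. A minimal, self-contained C++ sketch of that two-phase shape follows; Subprogram, ImportedEntity, and translateImported are invented stand-ins for illustration, not the MLIR or LLVM debug-info API.

#include <memory>
#include <string>
#include <vector>

struct Subprogram; // parent is created first; children point back at it

struct ImportedEntity {
  const Subprogram *scope; // supplied by the caller, not stored in the source
  std::string entity;
};

struct Subprogram {
  std::string name;
  std::vector<ImportedEntity> retainedNodes; // attached after construction
};

// Phase two: build one imported entity, with the already-created parent
// passed in explicitly as its scope.
static ImportedEntity translateImported(const std::string &entity,
                                        const Subprogram &scope) {
  return ImportedEntity{&scope, entity};
}

// Phase one creates the parent; phase two fills in its retained nodes.
static std::unique_ptr<Subprogram>
translateSubprogram(const std::string &name,
                    const std::vector<std::string> &imports) {
  auto sp = std::make_unique<Subprogram>();
  sp->name = name;
  for (const std::string &e : imports)
    sp->retainedNodes.push_back(translateImported(e, *sp));
  return sp;
}

int main() {
  auto fn = translateSubprogram("imp_fn", {"mod1", "mod2"});
  return fn->retainedNodes.size() == 2 ? 0 : 1;
}

Keeping the scope out of the attribute and threading it through the translate call is what avoids the cyclic dependency mentioned in the DebugTranslation.h comment later in this patch.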
+ SmallVector retainedNodes; + for (DINodeAttr nodeAttr : attr.getRetainedNodes()) { + if (auto importedAttr = dyn_cast(nodeAttr)) { + llvm::DINode *dn = translate(importedAttr, node); + retainedNodes.push_back(dn); + } + } + if (!retainedNodes.empty()) + node->replaceRetainedNodes(llvm::MDTuple::get(llvmCtx, retainedNodes)); + if (attr.getId()) distinctAttrToNode.try_emplace(attr.getId(), node); return node; @@ -326,6 +339,18 @@ llvm::DINamespace *DebugTranslation::translateImpl(DINamespaceAttr attr) { attr.getExportSymbols()); } +llvm::DIImportedEntity *DebugTranslation::translate(DIImportedEntityAttr attr, + llvm::DIScope *scope) { + SmallVector elements; + for (DINodeAttr member : attr.getElements()) + elements.push_back(translate(member)); + + return llvm::DIImportedEntity::get( + llvmCtx, attr.getTag(), scope, translate(attr.getEntity()), + translate(attr.getFile()), attr.getLine(), + getMDStringOrNull(attr.getName()), llvm::MDNode::get(llvmCtx, elements)); +} + llvm::DISubrange *DebugTranslation::translateImpl(DISubrangeAttr attr) { auto getMetadataOrNull = [&](Attribute attr) -> llvm::Metadata * { if (!attr) diff --git a/mlir/lib/Target/LLVMIR/DebugTranslation.h b/mlir/lib/Target/LLVMIR/DebugTranslation.h index 16a853736226d..37b985acf8541 100644 --- a/mlir/lib/Target/LLVMIR/DebugTranslation.h +++ b/mlir/lib/Target/LLVMIR/DebugTranslation.h @@ -90,6 +90,12 @@ class DebugTranslation { llvm::DISubroutineType *translateImpl(DISubroutineTypeAttr attr); llvm::DIType *translateImpl(DITypeAttr attr); + /// Currently, DIImportedEntityAttr does not have a scope field to avoid a + /// cyclic dependency. The scope information is obtained from the entity + /// which holds the list of DIImportedEntityAttr. This requires that scope + /// information be passed to translate function. + llvm::DIImportedEntity *translate(DIImportedEntityAttr attr, llvm::DIScope *); + /// Attributes that support self recursion need to implement an additional /// method to hook into `translateRecursive`. /// - ` translateTemporaryImpl()`: diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 83a14290bcf64..6a32aeb444140 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -592,7 +592,20 @@ convertOmpOrderedRegion(Operation &opInst, llvm::IRBuilderBase &builder, return bodyGenStatus; } +namespace { +/// Contains the arguments for an LLVM store operation +struct DeferredStore { + DeferredStore(llvm::Value *value, llvm::Value *address) + : value(value), address(address) {} + + llvm::Value *value; + llvm::Value *address; +}; +} // namespace + /// Allocate space for privatized reduction variables. 
+/// `deferredStores` contains information to create store operations which needs +/// to be inserted after all allocas template static LogicalResult allocReductionVars(T loop, ArrayRef reductionArgs, @@ -602,13 +615,13 @@ allocReductionVars(T loop, ArrayRef reductionArgs, SmallVectorImpl &reductionDecls, SmallVectorImpl &privateReductionVariables, DenseMap &reductionVariableMap, + SmallVectorImpl &deferredStores, llvm::ArrayRef isByRefs) { llvm::IRBuilderBase::InsertPointGuard guard(builder); builder.SetInsertPoint(allocaIP.getBlock()->getTerminator()); // delay creating stores until after all allocas - SmallVector> storesToCreate; - storesToCreate.reserve(loop.getNumReductionVars()); + deferredStores.reserve(loop.getNumReductionVars()); for (std::size_t i = 0; i < loop.getNumReductionVars(); ++i) { Region &allocRegion = reductionDecls[i].getAllocRegion(); @@ -628,7 +641,7 @@ allocReductionVars(T loop, ArrayRef reductionArgs, // variable allocated in the inlined region) llvm::Value *var = builder.CreateAlloca( moduleTranslation.convertType(reductionDecls[i].getType())); - storesToCreate.emplace_back(phis[0], var); + deferredStores.emplace_back(phis[0], var); privateReductionVariables[i] = var; moduleTranslation.mapValue(reductionArgs[i], phis[0]); @@ -644,10 +657,6 @@ allocReductionVars(T loop, ArrayRef reductionArgs, } } - // TODO: further delay this so it doesn't come in the entry block at all - for (auto [data, addr] : storesToCreate) - builder.CreateStore(data, addr); - return success(); } @@ -819,12 +828,19 @@ static LogicalResult allocAndInitializeReductionVars( if (op.getNumReductionVars() == 0) return success(); + SmallVector deferredStores; + if (failed(allocReductionVars(op, reductionArgs, builder, moduleTranslation, allocaIP, reductionDecls, privateReductionVariables, reductionVariableMap, - isByRef))) + deferredStores, isByRef))) return failure(); + // store result of the alloc region to the allocated pointer to the real + // reduction variable + for (auto [data, addr] : deferredStores) + builder.CreateStore(data, addr); + // Before the loop, store the initial values of reductions into reduction // variables. Although this could be done after allocas, we don't want to mess // up with the alloca insertion point. 
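To make the motivation for DeferredStore above concrete, here is a small standalone C++ sketch of the same idea, assuming nothing beyond the standard library (the string-based "instructions" and the %red.priv/%init names are purely illustrative): allocas are emitted first while the value/address pairs are recorded, and the corresponding stores are only materialized once all allocas are in place, so the entry block keeps its allocas grouped at the top.

#include <iostream>
#include <string>
#include <vector>

// Mirrors the DeferredStore struct added in this patch: just the two
// operands a store needs, captured for later.
struct DeferredStore {
  std::string value;
  std::string address;
};

int main() {
  std::vector<std::string> entryBlock; // pretend IR, one string per instr
  std::vector<DeferredStore> deferredStores;

  // Phase one: emit only allocas, remembering which stores they will need.
  for (int i = 0; i < 2; ++i) {
    std::string addr = "%red.priv" + std::to_string(i);
    entryBlock.push_back(addr + " = alloca i32");
    deferredStores.push_back({"%init" + std::to_string(i), addr});
  }

  // Phase two: all allocas exist, now append the deferred stores.
  for (const DeferredStore &s : deferredStores)
    entryBlock.push_back("store " + s.value + ", ptr " + s.address);

  for (const std::string &instr : entryBlock)
    std::cout << instr << "\n";
  return 0;
}

The patch hands the recorded pairs back through the deferredStores argument so each caller can choose where to materialize them: right after the allocas in allocAndInitializeReductionVars, or at the start of the parallel region body in convertOmpParallel.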
@@ -1359,6 +1375,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, collectReductionDecls(opInst, reductionDecls); SmallVector privateReductionVariables( opInst.getNumReductionVars()); + SmallVector deferredStores; auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) { // Allocate reduction vars @@ -1373,10 +1390,10 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, InsertPointTy(allocaIP.getBlock(), allocaIP.getBlock()->getTerminator()->getIterator()); - if (failed(allocReductionVars(opInst, reductionArgs, builder, - moduleTranslation, allocaIP, reductionDecls, - privateReductionVariables, - reductionVariableMap, isByRef))) + if (failed(allocReductionVars( + opInst, reductionArgs, builder, moduleTranslation, allocaIP, + reductionDecls, privateReductionVariables, reductionVariableMap, + deferredStores, isByRef))) bodyGenStatus = failure(); // Initialize reduction vars @@ -1401,6 +1418,12 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, builder.SetInsertPoint(initBlock->getFirstNonPHIOrDbgOrAlloca()); + // insert stores deferred until after all allocas + // these store the results of the alloc region into the allocation for the + // pointer to the reduction variable + for (auto [data, addr] : deferredStores) + builder.CreateStore(data, addr); + for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) { SmallVector phis; diff --git a/mlir/python/mlir/runtime/np_to_memref.py b/mlir/python/mlir/runtime/np_to_memref.py index 882b2751921bf..8cca1e7ad4a9e 100644 --- a/mlir/python/mlir/runtime/np_to_memref.py +++ b/mlir/python/mlir/runtime/np_to_memref.py @@ -37,6 +37,11 @@ class BF16(ctypes.Structure): _fields_ = [("bf16", ctypes.c_int16)] +class F8E5M2(ctypes.Structure): + """A ctype representation for MLIR's Float8E5M2.""" + + _fields_ = [("f8E5M2", ctypes.c_int8)] + # https://stackoverflow.com/questions/26921836/correct-way-to-test-for-numpy-dtype def as_ctype(dtp): @@ -49,6 +54,8 @@ def as_ctype(dtp): return F16 if ml_dtypes is not None and dtp == ml_dtypes.bfloat16: return BF16 + if ml_dtypes is not None and dtp == ml_dtypes.float8_e5m2: + return F8E5M2 return np.ctypeslib.as_ctypes_type(dtp) @@ -65,6 +72,11 @@ def to_numpy(array): ), f"bfloat16 requires the ml_dtypes package, please run:\n\npip install ml_dtypes\n" if array.dtype == BF16: return array.view("bfloat16") + assert not ( + array.dtype == F8E5M2 and ml_dtypes is None + ), f"float8_e5m2 requires the ml_dtypes package, please run:\n\npip install ml_dtypes\n" + if array.dtype == F8E5M2: + return array.view("float8_e5m2") return array diff --git a/mlir/test/CAPI/llvm.c b/mlir/test/CAPI/llvm.c index d3054aa6a0d93..da28a96f89691 100644 --- a/mlir/test/CAPI/llvm.c +++ b/mlir/test/CAPI/llvm.c @@ -312,9 +312,15 @@ static void testDebugInfoAttributes(MlirContext ctx) { // CHECK: #llvm.di_subroutine_type<{{.*}}> mlirAttributeDump(subroutine_type); - MlirAttribute di_subprogram = - mlirLLVMDISubprogramAttrGet(ctx, id, compile_unit, compile_unit, foo, bar, - file, 1, 2, 0, subroutine_type); + MlirAttribute di_imported_entity = mlirLLVMDIImportedEntityAttrGet( + ctx, 0, di_module, file, 1, foo, 1, &local_var); + + mlirAttributeDump(di_imported_entity); + // CHECK: #llvm.di_imported_entity<{{.*}}> + + MlirAttribute di_subprogram = mlirLLVMDISubprogramAttrGet( + ctx, id, compile_unit, compile_unit, foo, bar, file, 1, 2, 0, + subroutine_type, 1, &di_imported_entity); // CHECK: #llvm.di_subprogram<{{.*}}> mlirAttributeDump(di_subprogram); diff --git 
a/mlir/test/Conversion/ArithToAMDGPU/16-bit-floats.mlir b/mlir/test/Conversion/ArithToAMDGPU/16-bit-floats.mlir new file mode 100644 index 0000000000000..121cae26748a8 --- /dev/null +++ b/mlir/test/Conversion/ArithToAMDGPU/16-bit-floats.mlir @@ -0,0 +1,51 @@ +// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="allow-packed-f16-round-to-zero=true" | FileCheck %s + +// CHECK-LABEL: @scalar_trunc +// CHECK-SAME: (%[[value:.*]]: f32) +func.func @scalar_trunc(%v: f32) -> f16{ + // CHECK: %[[poison:.*]] = llvm.mlir.poison : f32 + // CHECK: %[[trunc:.*]] = rocdl.cvt.pkrtz %[[value]], %[[poison]] : vector<2xf16> + // CHECK: %[[extract:.*]] = vector.extractelement %[[trunc]][%c0 : index] : vector<2xf16> + // CHECK: return %[[extract]] : f16 + %w = arith.truncf %v : f32 to f16 + return %w : f16 +} + +// CHECK-LABEL: @vector_trunc +// CHECK-SAME: (%[[value:.*]]: vector<2xf32>) +func.func @vector_trunc_short(%v: vector<2xf32>) -> vector<2xf16> { + // CHECK: %[[elem0:.*]] = vector.extractelement %[[value]] + // CHECK: %[[elem1:.*]] = vector.extractelement %[[value]] + // CHECK: %[[ret:.*]] = rocdl.cvt.pkrtz %[[elem0]], %[[elem1]] : vector<2xf16> + // CHECK: return %[[ret]] + %w = arith.truncf %v : vector<2xf32> to vector<2xf16> + return %w : vector<2xf16> +} + +// CHECK-LABEL: @vector_trunc_long +// CHECK-SAME: (%[[value:.*]]: vector<9xf32>) +func.func @vector_trunc_long(%v: vector<9xf32>) -> vector<9xf16> { + // CHECK: %[[elem0:.*]] = vector.extractelement %[[value]][%c0 : index] + // CHECK: %[[elem1:.*]] = vector.extractelement %[[value]][%c1 : index] + // CHECK: %[[packed0:.*]] = rocdl.cvt.pkrtz %[[elem0]], %[[elem1]] : vector<2xf16> + // CHECK: %[[out0:.*]] = vector.insert_strided_slice %[[packed0]], {{.*}} {offsets = [0], strides = [1]} : vector<2xf16> into vector<9xf16> + // CHECK: %[[elem2:.*]] = vector.extractelement %[[value]][%c2 : index] + // CHECK: %[[elem3:.*]] = vector.extractelement %[[value]][%c3 : index] + // CHECK: %[[packed1:.*]] = rocdl.cvt.pkrtz %[[elem2]], %[[elem3]] : vector<2xf16> + // CHECK: %[[out1:.*]] = vector.insert_strided_slice %[[packed1]], %[[out0]] {offsets = [2], strides = [1]} : vector<2xf16> into vector<9xf16> + // CHECK: %[[elem4:.*]] = vector.extractelement %[[value]][%c4 : index] + // CHECK: %[[elem5:.*]] = vector.extractelement %[[value]][%c5 : index] + // CHECK: %[[packed2:.*]] = rocdl.cvt.pkrtz %[[elem4]], %[[elem5]] : vector<2xf16> + // CHECK: %[[out2:.*]] = vector.insert_strided_slice %[[packed2]], %[[out1]] {offsets = [4], strides = [1]} : vector<2xf16> into vector<9xf16> + // CHECK: %[[elem6:.*]] = vector.extractelement %[[value]] + // CHECK: %[[elem7:.*]] = vector.extractelement %[[value]] + // CHECK: %[[packed3:.*]] = rocdl.cvt.pkrtz %[[elem6]], %[[elem7]] : vector<2xf16> + // CHECK: %[[out3:.*]] = vector.insert_strided_slice %[[packed3]], %[[out2]] {offsets = [6], strides = [1]} : vector<2xf16> into vector<9xf16> + // CHECK: %[[elem8:.*]] = vector.extractelement %[[value]] + // CHECK: %[[packed4:.*]] = rocdl.cvt.pkrtz %[[elem8:.*]] : vector<2xf16> + // CHECK: %[[slice:.*]] = vector.extract_strided_slice %[[packed4]] {offsets = [0], sizes = [1], strides = [1]} : vector<2xf16> to vector<1xf16> + // CHECK: %[[out4:.*]] = vector.insert_strided_slice %[[slice]], %[[out3]] {offsets = [8], strides = [1]} : vector<1xf16> into vector<9xf16> + // CHECK: return %[[out4]] + %w = arith.truncf %v : vector<9xf32> to vector<9xf16> + return %w : vector<9xf16> +} diff --git a/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir 
b/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir index c7f39440a349b..cd921da2294e1 100644 --- a/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir +++ b/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt --split-input-file %s \ -// RUN: --pass-pipeline='builtin.module(func.func(convert-arith-to-amdgpu{saturate-fp8-truncf=true}))' \ +// RUN: --pass-pipeline='builtin.module(func.func(convert-arith-to-amdgpu{chipset=gfx940 saturate-fp8-truncf=true}))' \ // RUN: | FileCheck %s // CHECK-LABEL: func.func @scalar_trunc diff --git a/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir b/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir index 26a222a4a788e..bd90facb61544 100644 --- a/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir +++ b/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu | FileCheck %s +// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="chipset=gfx940" | FileCheck %s // CHECK-LABEL: func.func @scalar_ext // CHECK-SAME: ([[V:%.+]]: f8E5M2FNUZ) diff --git a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir index ef0e71ee8673b..b86690461dc26 100644 --- a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir +++ b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir @@ -15,36 +15,35 @@ func.func @arith_cast_vector(%arg0: vector<5xf32>) -> vector<5xi32> { } // ----- - -func.func @arith_cast_bf16(%arg0: bf16) -> i32 { +func.func @arith_cast_f80(%arg0: f80) -> i32 { // expected-error @+1 {{failed to legalize operation 'arith.fptosi'}} - %t = arith.fptosi %arg0 : bf16 to i32 + %t = arith.fptosi %arg0 : f80 to i32 return %t: i32 } // ----- -func.func @arith_cast_f16(%arg0: f16) -> i32 { +func.func @arith_cast_f128(%arg0: f128) -> i32 { // expected-error @+1 {{failed to legalize operation 'arith.fptosi'}} - %t = arith.fptosi %arg0 : f16 to i32 + %t = arith.fptosi %arg0 : f128 to i32 return %t: i32 } // ----- -func.func @arith_cast_to_bf16(%arg0: i32) -> bf16 { +func.func @arith_cast_to_f80(%arg0: i32) -> f80 { // expected-error @+1 {{failed to legalize operation 'arith.sitofp'}} - %t = arith.sitofp %arg0 : i32 to bf16 - return %t: bf16 + %t = arith.sitofp %arg0 : i32 to f80 + return %t: f80 } // ----- -func.func @arith_cast_to_f16(%arg0: i32) -> f16 { +func.func @arith_cast_to_f128(%arg0: i32) -> f128 { // expected-error @+1 {{failed to legalize operation 'arith.sitofp'}} - %t = arith.sitofp %arg0 : i32 to f16 - return %t: f16 + %t = arith.sitofp %arg0 : i32 to f128 + return %t: f128 } // ----- diff --git a/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-failed.mlir b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-failed.mlir index 836d8aedefc1f..dee9cc97a1449 100644 --- a/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-failed.mlir +++ b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-failed.mlir @@ -46,9 +46,9 @@ memref.global "nested" constant @nested_global : memref<3x7xf32> // ----- -func.func @unsupported_type_f16() { +func.func @unsupported_type_f128() { // expected-error@+1 {{failed to legalize operation 'memref.alloca'}} - %0 = memref.alloca() : memref<4xf16> + %0 = memref.alloca() : memref<4xf128> return } diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir index 
0248afb11f167..0d5224514e3a0 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir @@ -71,6 +71,30 @@ func.func @return_extract_slice(%idx: index, %sz: index) -> (tensor<2x?xf32>) // ----- +// CHECK-NO-LAYOUT-MAP-LABEL: func.func @foo( +// CHECK-NO-LAYOUT-MAP-SAME: %[[VAL_0:.*]]: memref<3x8xf16>) -> memref<3x8xf16> { +// CHECK-NO-LAYOUT-MAP: return %[[VAL_0]] : memref<3x8xf16> +// CHECK-NO-LAYOUT-MAP: } +func.func @foo(%arg0: tensor<3x8xf16>) -> tensor<3x8xf16> { + return %arg0 : tensor<3x8xf16> +} + +// CHECK-NO-LAYOUT-MAP-LABEL: func.func @call_extract_slice( +// CHECK-NO-LAYOUT-MAP-SAME: %[[VAL_0:.*]]: memref<4x8xf16>) -> memref<3x8xf16> { +// CHECK-NO-LAYOUT-MAP: %[[VAL_1:.*]] = memref.subview %[[VAL_0]][1, 0] [3, 8] [1, 1] : memref<4x8xf16> to memref<3x8xf16, strided<[8, 1], offset: 8>> +// CHECK-NO-LAYOUT-MAP: %[[VAL_2:.*]] = memref.alloc() {alignment = 64 : i64} : memref<3x8xf16> +// CHECK-NO-LAYOUT-MAP: memref.copy %[[VAL_1]], %[[VAL_2]] : memref<3x8xf16, strided<[8, 1], offset: 8>> to memref<3x8xf16> +// CHECK-NO-LAYOUT-MAP: %[[VAL_3:.*]] = call @foo(%[[VAL_2]]) : (memref<3x8xf16>) -> memref<3x8xf16> +// CHECK-NO-LAYOUT-MAP: return %[[VAL_3]] : memref<3x8xf16> +// CHECK-NO-LAYOUT-MAP: } +func.func @call_extract_slice(%arg0: tensor<4x8xf16>) -> (tensor<3x8xf16>) { + %0 = tensor.extract_slice %arg0[1, 0] [3, 8] [1, 1] : tensor<4x8xf16> to tensor<3x8xf16> + %1 = call @foo(%0) : (tensor<3x8xf16>) -> tensor<3x8xf16> + return %1 : tensor<3x8xf16> +} + +// ----- + // CHECK-LABEL: func private @private_func // CHECK-NO-LAYOUT-MAP-LABEL: func private @private_func(memref) -> f32 func.func private @private_func(tensor) -> (f32) diff --git a/mlir/test/Dialect/DLTI/invalid.mlir b/mlir/test/Dialect/DLTI/invalid.mlir index 05f919fa25671..4b04f0195ef82 100644 --- a/mlir/test/Dialect/DLTI/invalid.mlir +++ b/mlir/test/Dialect/DLTI/invalid.mlir @@ -33,6 +33,14 @@ // ----- +// expected-error@below {{repeated layout entry key: 'i32'}} +"test.unknown_op"() { test.unknown_attr = #dlti.map< + #dlti.dl_entry, + #dlti.dl_entry +>} : () -> () + +// ----- + // expected-error@below {{repeated layout entry key: 'i32'}} "test.unknown_op"() { test.unknown_attr = #dlti.dl_spec< #dlti.dl_entry, diff --git a/mlir/test/Dialect/DLTI/query.mlir b/mlir/test/Dialect/DLTI/query.mlir index 10e91afd2ca7e..a793c1a6e8e6a 100644 --- a/mlir/test/Dialect/DLTI/query.mlir +++ b/mlir/test/Dialect/DLTI/query.mlir @@ -17,6 +17,60 @@ module attributes {transform.with_named_sequence} { // ----- +// expected-remark @below {{i32 present in set : unit}} +module attributes { test.dlti = #dlti.map<#dlti.dl_entry>} { + func.func private @f() +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg: !transform.any_op) { + %funcs = transform.structured.match ops{["func.func"]} in %arg : (!transform.any_op) -> !transform.any_op + %module = transform.get_parent_op %funcs : (!transform.any_op) -> !transform.any_op + %param = transform.dlti.query [i32] at %module : (!transform.any_op) -> !transform.any_param + transform.debug.emit_param_as_remark %param, "i32 present in set :" at %module : !transform.any_param, !transform.any_op + transform.yield + } +} + +// ----- + +// expected-remark @below {{associated attr 32 : i32}} +module attributes { test.dlti = #dlti.map<#dlti.dl_entry>>>} { + func.func private @f() +} + +module attributes {transform.with_named_sequence} { + 
transform.named_sequence @__transform_main(%arg: !transform.any_op) { + %funcs = transform.structured.match ops{["func.func"]} in %arg : (!transform.any_op) -> !transform.any_op + %module = transform.get_parent_op %funcs : (!transform.any_op) -> !transform.any_op + %param = transform.dlti.query [i32,"width_in_bits"] at %module : (!transform.any_op) -> !transform.any_param + transform.debug.emit_param_as_remark %param, "associated attr" at %module : !transform.any_param, !transform.any_op + transform.yield + } +} + +// ----- + +// expected-remark @below {{width in bits of i32 = 32 : i64}} +// expected-remark @below {{width in bits of f64 = 64 : i64}} +module attributes { test.dlti = #dlti.map<#dlti.dl_entry<"width_in_bits", #dlti.map<#dlti.dl_entry, #dlti.dl_entry>>>} { + func.func private @f() +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg: !transform.any_op) { + %funcs = transform.structured.match ops{["func.func"]} in %arg : (!transform.any_op) -> !transform.any_op + %module = transform.get_parent_op %funcs : (!transform.any_op) -> !transform.any_op + %i32bits = transform.dlti.query ["width_in_bits",i32] at %module : (!transform.any_op) -> !transform.any_param + %f64bits = transform.dlti.query ["width_in_bits",f64] at %module : (!transform.any_op) -> !transform.any_param + transform.debug.emit_param_as_remark %i32bits, "width in bits of i32 =" at %module : !transform.any_param, !transform.any_op + transform.debug.emit_param_as_remark %f64bits, "width in bits of f64 =" at %module : !transform.any_param, !transform.any_op + transform.yield + } +} + +// ----- + // expected-remark @below {{associated attr 42 : i32}} module attributes { test.dlti = #dlti.dl_spec<#dlti.dl_entry<"test.id", 42 : i32>>} { func.func private @f() @@ -336,6 +390,23 @@ module attributes {transform.with_named_sequence} { // ----- +// expected-note @below {{got non-DLTI-queryable attribute upon looking up keys [i32]}} +module attributes { test.dlti = #dlti.dl_spec<#dlti.dl_entry>} { + // expected-error @below {{target op of failed DLTI query}} + func.func private @f() +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg: !transform.any_op) { + %func = transform.structured.match ops{["func.func"]} in %arg : (!transform.any_op) -> !transform.any_op + // expected-error @below {{'transform.dlti.query' op failed to apply}} + %param = transform.dlti.query [i32,"width_in_bits"] at %func : (!transform.any_op) -> !transform.any_param + transform.yield + } +} + +// ----- + module { // expected-error @below {{target op of failed DLTI query}} // expected-note @below {{no DLTI-queryable attrs on target op or any of its ancestors}} @@ -353,6 +424,55 @@ module attributes {transform.with_named_sequence} { // ----- +// expected-note @below {{key i64 has no DLTI-mapping per attr: #dlti.map<#dlti.dl_entry>}} +module attributes { test.dlti = #dlti.map<#dlti.dl_entry<"width_in_bits", #dlti.map<#dlti.dl_entry>>>} { + // expected-error @below {{target op of failed DLTI query}} + func.func private @f() +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg: !transform.any_op) { + %func = transform.structured.match ops{["func.func"]} in %arg : (!transform.any_op) -> !transform.any_op + // expected-error @below {{'transform.dlti.query' op failed to apply}} + %param = transform.dlti.query ["width_in_bits",i64] at %func : (!transform.any_op) -> !transform.any_param + 
transform.yield + } +} + +// ----- + +module attributes { test.dlti = #dlti.dl_spec<#dlti.dl_entry<"test.id", 42 : i32>>} { + func.func private @f() +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg: !transform.any_op) { + %funcs = transform.structured.match ops{["func.func"]} in %arg : (!transform.any_op) -> !transform.any_op + // expected-error @below {{'transform.dlti.query' keys of wrong type: only StringAttr and TypeAttr are allowed}} + %param = transform.dlti.query [1] at %funcs : (!transform.any_op) -> !transform.param + transform.yield + } +} + +// ----- + +module attributes { test.dlti = #dlti.map<#dlti.dl_entry<"test.id", 42 : i32>>} { + // expected-error @below {{target op of failed DLTI query}} + // expected-note @below {{no keys provided to attempt query with}} + func.func private @f() +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg: !transform.any_op) { + %func = transform.structured.match ops{["func.func"]} in %arg : (!transform.any_op) -> !transform.any_op + // expected-error @below {{'transform.dlti.query' op failed to apply}} + %param = transform.dlti.query [] at %func : (!transform.any_op) -> !transform.any_param + transform.yield + } +} + +// ----- + module attributes { test.dlti = #dlti.dl_spec<#dlti.dl_entry<"test.id", 42 : i32>>} { func.func private @f() } diff --git a/mlir/test/Dialect/DLTI/valid.mlir b/mlir/test/Dialect/DLTI/valid.mlir index 4133eac5424ce..31c925e5cb5be 100644 --- a/mlir/test/Dialect/DLTI/valid.mlir +++ b/mlir/test/Dialect/DLTI/valid.mlir @@ -206,3 +206,18 @@ module attributes { "GPU": #dlti.target_device_spec< #dlti.dl_entry<"L1_cache_size_in_bytes", "128">> >} {} + + +// ----- + +// CHECK: "test.op_with_dlti_map"() ({ +// CHECK: }) {dlti.map = #dlti.map<#dlti.dl_entry<"dlti.unknown_id", 42 : i64>>} +"test.op_with_dlti_map"() ({ +}) { dlti.map = #dlti.map<#dlti.dl_entry<"dlti.unknown_id", 42>> } : () -> () + +// ----- + +// CHECK: "test.op_with_dlti_map"() ({ +// CHECK: }) {dlti.map = #dlti.map<#dlti.dl_entry>} +"test.op_with_dlti_map"() ({ +}) { dlti.map = #dlti.map<#dlti.dl_entry> } : () -> () diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index 9c308cc010849..7ba55fc957a47 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -1019,10 +1019,10 @@ func.func @parallel_reduction_byref() { func.func @parallel_wsloop_reduction(%lb : index, %ub : index, %step : index) { %c1 = arith.constant 1 : i32 %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr - // CHECK: omp.parallel reduction(@add_f32 %{{.*}} -> %{{.+}} : !llvm.ptr) { - omp.parallel reduction(@add_f32 %0 -> %prv : !llvm.ptr) { - // CHECK: omp.wsloop { - omp.wsloop { + // CHECK: omp.parallel { + omp.parallel { + // CHECK: omp.wsloop reduction(@add_f32 %{{.*}} -> %{{.+}} : !llvm.ptr) { + omp.wsloop reduction(@add_f32 %0 -> %prv : !llvm.ptr) { // CHECK: omp.loop_nest (%{{.+}}) : index = (%{{.+}}) to (%{{.+}}) step (%{{.+}}) { omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { %1 = arith.constant 2.0 : f32 @@ -1216,10 +1216,10 @@ func.func @parallel_reduction2() { func.func @parallel_wsloop_reduction2(%lb : index, %ub : index, %step : index) { %c1 = arith.constant 1 : i32 %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr - // CHECK: omp.parallel reduction(@add2_f32 %{{.*}} -> %{{.+}} : !llvm.ptr) { - omp.parallel reduction(@add2_f32 %0 -> %prv : !llvm.ptr) { - // CHECK: omp.wsloop { - omp.wsloop { + // 
CHECK: omp.parallel { + omp.parallel { + // CHECK: omp.wsloop reduction(@add2_f32 %{{.*}} -> %{{.+}} : !llvm.ptr) { + omp.wsloop reduction(@add2_f32 %0 -> %prv : !llvm.ptr) { // CHECK: omp.loop_nest (%{{.+}}) : index = (%{{.+}}) to (%{{.+}}) step (%{{.+}}) { omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { %1 = arith.constant 2.0 : f32 diff --git a/mlir/test/Target/Cpp/const.mlir b/mlir/test/Target/Cpp/const.mlir index 3658455d66943..d3656f830c48c 100644 --- a/mlir/test/Target/Cpp/const.mlir +++ b/mlir/test/Target/Cpp/const.mlir @@ -11,6 +11,8 @@ func.func @emitc_constant() { %c6 = "emitc.constant"(){value = 2 : index} : () -> index %c7 = "emitc.constant"(){value = 2.0 : f32} : () -> f32 %f64 = "emitc.constant"(){value = 4.0 : f64} : () -> f64 + %f16 = "emitc.constant"(){value = 2.0 : f16} : () -> f16 + %bf16 = "emitc.constant"(){value = 4.0 : bf16} : () -> bf16 %c8 = "emitc.constant"(){value = dense<0> : tensor} : () -> tensor %c9 = "emitc.constant"(){value = dense<[0, 1]> : tensor<2xindex>} : () -> tensor<2xindex> %c10 = "emitc.constant"(){value = dense<[[0.0, 1.0], [2.0, 3.0]]> : tensor<2x2xf32>} : () -> tensor<2x2xf32> @@ -26,6 +28,8 @@ func.func @emitc_constant() { // CPP-DEFAULT-NEXT: size_t [[V6:[^ ]*]] = 2; // CPP-DEFAULT-NEXT: float [[V7:[^ ]*]] = 2.000000000e+00f; // CPP-DEFAULT-NEXT: double [[F64:[^ ]*]] = 4.00000000000000000e+00; +// CPP-DEFAULT-NEXT: _Float16 [[F16:[^ ]*]] = 2.00000e+00f16; +// CPP-DEFAULT-NEXT: __bf16 [[BF16:[^ ]*]] = 4.0000e+00bf16; // CPP-DEFAULT-NEXT: Tensor [[V8:[^ ]*]] = {0}; // CPP-DEFAULT-NEXT: Tensor [[V9:[^ ]*]] = {0, 1}; // CPP-DEFAULT-NEXT: Tensor [[V10:[^ ]*]] = {0.0e+00f, 1.000000000e+00f, 2.000000000e+00f, 3.000000000e+00f}; @@ -40,6 +44,8 @@ func.func @emitc_constant() { // CPP-DECLTOP-NEXT: size_t [[V6:[^ ]*]]; // CPP-DECLTOP-NEXT: float [[V7:[^ ]*]]; // CPP-DECLTOP-NEXT: double [[F64:[^ ]*]]; +// CPP-DECLTOP-NEXT: _Float16 [[F16:[^ ]*]]; +// CPP-DECLTOP-NEXT: __bf16 [[BF16:[^ ]*]]; // CPP-DECLTOP-NEXT: Tensor [[V8:[^ ]*]]; // CPP-DECLTOP-NEXT: Tensor [[V9:[^ ]*]]; // CPP-DECLTOP-NEXT: Tensor [[V10:[^ ]*]]; @@ -52,6 +58,8 @@ func.func @emitc_constant() { // CPP-DECLTOP-NEXT: [[V6]] = 2; // CPP-DECLTOP-NEXT: [[V7]] = 2.000000000e+00f; // CPP-DECLTOP-NEXT: [[F64]] = 4.00000000000000000e+00; +// CPP-DECLTOP-NEXT: [[F16]] = 2.00000e+00f16; +// CPP-DECLTOP-NEXT: [[BF16]] = 4.0000e+00bf16; // CPP-DECLTOP-NEXT: [[V8]] = {0}; // CPP-DECLTOP-NEXT: [[V9]] = {0, 1}; // CPP-DECLTOP-NEXT: [[V10]] = {0.0e+00f, 1.000000000e+00f, 2.000000000e+00f, 3.000000000e+00f}; diff --git a/mlir/test/Target/Cpp/types.mlir b/mlir/test/Target/Cpp/types.mlir index deda383b3b0a7..e7f935c737438 100644 --- a/mlir/test/Target/Cpp/types.mlir +++ b/mlir/test/Target/Cpp/types.mlir @@ -22,6 +22,10 @@ func.func @ptr_types() { emitc.call_opaque "f"() {template_args = [!emitc.ptr]} : () -> () // CHECK-NEXT: f(); emitc.call_opaque "f"() {template_args = [!emitc.ptr]} : () -> () + // CHECK-NEXT: f<_Float16*>(); + emitc.call_opaque "f"() {template_args = [!emitc.ptr]} : () -> () + // CHECK-NEXT: f<__bf16*>(); + emitc.call_opaque "f"() {template_args = [!emitc.ptr]} : () -> () // CHECK-NEXT: f(); emitc.call_opaque "f"() {template_args = [!emitc.ptr]} : () -> () // CHECK-NEXT: f(); diff --git a/mlir/test/Target/LLVMIR/Import/debug-info.ll b/mlir/test/Target/LLVMIR/Import/debug-info.ll index 03c3855a9a324..bb03da37c0d09 100644 --- a/mlir/test/Target/LLVMIR/Import/debug-info.ll +++ b/mlir/test/Target/LLVMIR/Import/debug-info.ll @@ -792,3 +792,28 @@ define void @string_type(ptr 
%arg1) { ; CHECK-SAME: stringLengthExp = <[DW_OP_push_object_address, DW_OP_plus_uconst(8)]> ; CHECK-SAME: stringLocationExp = <[DW_OP_push_object_address, DW_OP_deref]>> ; CHECK: #di_local_variable1 = #llvm.di_local_variable + +; // ----- + +; Test that imported entities for a functions are handled correctly. + +define void @imp_fn() !dbg !12 { + ret void +} + +!llvm.module.flags = !{!10} +!llvm.dbg.cu = !{!4} + +!2 = !DIModule(scope: !4, name: "mod1", file: !3, line: 1) +!3 = !DIFile(filename: "test.f90", directory: "") +!4 = distinct !DICompileUnit(language: DW_LANG_Fortran95, file: !3) +!8 = !DIModule(scope: !4, name: "mod1", file: !3, line: 5) +!10 = !{i32 2, !"Debug Info Version", i32 3} +!12 = distinct !DISubprogram(name: "imp_fn", linkageName: "imp_fn", scope: !3, file: !3, line: 10, type: !14, scopeLine: 10, spFlags: DISPFlagDefinition, unit: !4, retainedNodes: !16) +!14 = !DISubroutineType(cc: DW_CC_program, types: !15) +!15 = !{} +!16 = !{!17} +!17 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !12, entity: !8, file: !3, line: 1, elements: !15) + +; CHECK-DAG: #[[M:.+]] = #llvm.di_module<{{.*}}name = "mod1"{{.*}}> +; CHECK-DAG: #[[SP:.+]] = #llvm.di_subprogram<{{.*}}name = "imp_fn"{{.*}}retainedNodes = #llvm.di_imported_entity> diff --git a/mlir/test/Target/LLVMIR/llvmir-debug.mlir b/mlir/test/Target/LLVMIR/llvmir-debug.mlir index 1a9a8561de00d..30b2ba5e9bad1 100644 --- a/mlir/test/Target/LLVMIR/llvmir-debug.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-debug.mlir @@ -366,6 +366,36 @@ llvm.func @fn_with_gl() { // ----- +// Test that imported entries correctly generates 'retainedNodes' in the +// subprogram. + +llvm.func @imp_fn() { + llvm.return +} loc(#loc2) +#file = #llvm.di_file<"test.f90" in ""> +#SP_TY = #llvm.di_subroutine_type +#CU = #llvm.di_compile_unit, + sourceLanguage = DW_LANG_Fortran95, file = #file, isOptimized = false, + emissionKind = Full> +#MOD = #llvm.di_module +#MOD1 = #llvm.di_module +#SP = #llvm.di_subprogram, compileUnit = #CU, scope = #file, + name = "imp_fn", file = #file, subprogramFlags = Definition, type = #SP_TY, + retainedNodes = #llvm.di_imported_entity, #llvm.di_imported_entity> +#loc1 = loc("test.f90":12:14) +#loc2 = loc(fused<#SP>[#loc1]) + +// CHECK-DAG: ![[SP:[0-9]+]] = {{.*}}!DISubprogram(name: "imp_fn"{{.*}}retainedNodes: ![[NODES:[0-9]+]]) +// CHECK-DAG: ![[NODES]] = !{![[NODE2:[0-9]+]], ![[NODE1:[0-9]+]]} +// CHECK-DAG: ![[NODE1]] = !DIImportedEntity(tag: DW_TAG_imported_module, scope: ![[SP]], entity: ![[MOD1:[0-9]+]]{{.*}}) +// CHECK-DAG: ![[NODE2]] = !DIImportedEntity(tag: DW_TAG_imported_module, scope: ![[SP]], entity: ![[MOD2:[0-9]+]]{{.*}}) +// CHECK-DAG: ![[MOD1]] = !DIModule({{.*}}name: "mod1"{{.*}}) +// CHECK-DAG: ![[MOD2]] = !DIModule({{.*}}name: "mod2"{{.*}}) + +// ----- + // Nameless and scopeless global constant. // CHECK-LABEL: @.str.1 = external constant [10 x i8] diff --git a/mlir/test/Target/LLVMIR/openmp-firstprivate.mlir b/mlir/test/Target/LLVMIR/openmp-firstprivate.mlir index b06ad96f4592c..02ce6b5b19cea 100644 --- a/mlir/test/Target/LLVMIR/openmp-firstprivate.mlir +++ b/mlir/test/Target/LLVMIR/openmp-firstprivate.mlir @@ -156,3 +156,49 @@ llvm.func @foo() // CHECK: %[[STR_LEN:.*]] = extractvalue { ptr, i64 } %{{.*}}, 1 // CHECK: %{{.*}} = alloca i8, i64 %[[STR_LEN]], align 1 // CHECK: call void @foo() + +// ----- + +// Verifies fix for https://github.com/llvm/llvm-project/issues/102939. 
+// +// The issues occurs because the CodeExtractor component only collect inputs +// (to the parallel regions) that are defined in the same function in which the +// parallel regions is present. Howerver, this is problematic because if we are +// privatizing a global value (e.g. a `target` variable which is emitted as a +// global), then we miss finding that input and we do not privatize the +// variable. + +omp.private {type = firstprivate} @global_privatizer : !llvm.ptr alloc { +^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x f32 {bindc_name = "global", pinned} : (i64) -> !llvm.ptr + omp.yield(%1 : !llvm.ptr) +} copy { +^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.load %arg0 : !llvm.ptr -> f32 + llvm.store %0, %arg1 : f32, !llvm.ptr + omp.yield(%arg1 : !llvm.ptr) +} + +llvm.func @global_accessor() { + %global_addr = llvm.mlir.addressof @global : !llvm.ptr + omp.parallel private(@global_privatizer %global_addr -> %arg0 : !llvm.ptr) { + %1 = llvm.mlir.constant(3.140000e+00 : f32) : f32 + llvm.store %1, %arg0 : f32, !llvm.ptr + omp.terminator + } + llvm.return +} + +llvm.mlir.global internal @global() {addr_space = 0 : i32} : f32 { + %0 = llvm.mlir.zero : f32 + llvm.return %0 : f32 +} + +// CHECK-LABEL: @global_accessor..omp_par({{.*}}) +// CHECK-NEXT: omp.par.entry: +// Verify that we found the privatizer by checking that we properly inlined the +// bodies of the alloc and copy regions. +// CHECK: %[[PRIV_ALLOC:.*]] = alloca float, i64 1, align 4 +// CHECK: %[[GLOB_VAL:.*]] = load float, ptr @global, align 4 +// CHECK: store float %[[GLOB_VAL]], ptr %[[PRIV_ALLOC]], align 4 diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir index da6f943004612..2d8a13ccd2a1f 100644 --- a/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir +++ b/mlir/test/Target/LLVMIR/openmp-reduction-array-sections.mlir @@ -89,7 +89,6 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute // CHECK: %[[VAL_13:.*]] = load i32, ptr %[[VAL_10]], align 4 // CHECK: %[[VAL_20:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8 // CHECK: %[[VAL_21:.*]] = alloca ptr, align 8 -// CHECK: store ptr %[[VAL_20]], ptr %[[VAL_21]], align 8 // CHECK: %[[VAL_14:.*]] = alloca [1 x ptr], align 8 // CHECK: br label %[[VAL_15:.*]] // CHECK: omp.reduction.init: ; preds = %[[VAL_16:.*]] @@ -98,6 +97,7 @@ llvm.func @sectionsreduction_(%arg0: !llvm.ptr {fir.bindc_name = "x"}) attribute // CHECK: br label %[[VAL_18:.*]] // CHECK: omp.par.region1: ; preds = %[[VAL_17]] // CHECK: %[[VAL_19:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8 +// CHECK: store ptr %[[VAL_20]], ptr %[[VAL_21]], align 8 // CHECK: br label %[[VAL_22:.*]] // CHECK: omp_section_loop.preheader: ; preds = %[[VAL_18]] // CHECK: store i32 0, ptr %[[VAL_7]], align 4 diff --git a/mlir/test/Target/LLVMIR/openmp-reduction.mlir b/mlir/test/Target/LLVMIR/openmp-reduction.mlir index bfdad8c19335e..1d4b4915bcc39 100644 --- a/mlir/test/Target/LLVMIR/openmp-reduction.mlir +++ b/mlir/test/Target/LLVMIR/openmp-reduction.mlir @@ -540,8 +540,8 @@ llvm.func @parallel_nested_workshare_reduction(%ub : i64) { %lb = llvm.mlir.constant(1 : i64) : i64 %step = llvm.mlir.constant(1 : i64) : i64 - omp.parallel reduction(@add_i32 %0 -> %prv : !llvm.ptr) { - omp.wsloop { + omp.parallel { + omp.wsloop reduction(@add_i32 %0 -> %prv : !llvm.ptr) { omp.loop_nest (%iv) : i64 = 
(%lb) to (%ub) step (%step) { %ival = llvm.trunc %iv : i64 to i32 %lprv = llvm.load %prv : !llvm.ptr -> i32 diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 64bcb5bdb255d..d902a82eeb9ea 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -530,6 +530,12 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 { llvm.return %source5 : i32 } +llvm.func @rocdl_16bit_packed_floats(%sourceA: f32, %sourceB: f32) -> vector<2xf16> { + // CHECK: call <2 x half> @llvm.amdgcn.cvt.pkrtz(float {{.*}}, float {{.*}}) + %source = rocdl.cvt.pkrtz %sourceA, %sourceB : vector<2xf16> + llvm.return %source : vector<2xf16> +} + // CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="true" } // CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024" // CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128" diff --git a/mlir/test/python/execution_engine.py b/mlir/test/python/execution_engine.py index 8125bf3fb8fc9..1cdda63eefe30 100644 --- a/mlir/test/python/execution_engine.py +++ b/mlir/test/python/execution_engine.py @@ -5,7 +5,7 @@ from mlir.passmanager import * from mlir.execution_engine import * from mlir.runtime import * -from ml_dtypes import bfloat16 +from ml_dtypes import bfloat16, float8_e5m2 # Log everything to stderr and flush so that we have a unified stream to match @@ -561,6 +561,45 @@ def testBF16Memref(): run(testBF16Memref) +# Test f8E5M2 memrefs +# CHECK-LABEL: TEST: testF8E5M2Memref +def testF8E5M2Memref(): + with Context(): + module = Module.parse( + """ + module { + func.func @main(%arg0: memref<1xf8E5M2>, + %arg1: memref<1xf8E5M2>) attributes { llvm.emit_c_interface } { + %0 = arith.constant 0 : index + %1 = memref.load %arg0[%0] : memref<1xf8E5M2> + memref.store %1, %arg1[%0] : memref<1xf8E5M2> + return + } + } """ + ) + + arg1 = np.array([0.5]).astype(float8_e5m2) + arg2 = np.array([0.0]).astype(float8_e5m2) + + arg1_memref_ptr = ctypes.pointer( + ctypes.pointer(get_ranked_memref_descriptor(arg1)) + ) + arg2_memref_ptr = ctypes.pointer( + ctypes.pointer(get_ranked_memref_descriptor(arg2)) + ) + + execution_engine = ExecutionEngine(lowerToLLVM(module)) + execution_engine.invoke("main", arg1_memref_ptr, arg2_memref_ptr) + + # test to-numpy utility + # CHECK: [0.5] + npout = ranked_memref_to_numpy(arg2_memref_ptr[0]) + log(npout) + + +run(testF8E5M2Memref) + + # Test addition of two 2d_memref # CHECK-LABEL: TEST: testDynamicMemrefAdd2D def testDynamicMemrefAdd2D(): diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index ddb08f12f0497..866bd5ed6fd3e 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -1534,6 +1534,7 @@ cc_library( ":BytecodeOpInterface", ":GPUDialect", ":IR", + ":ROCDLDialect", ":SideEffectInterfaces", "//llvm:Support", ], @@ -8581,11 +8582,14 @@ cc_library( includes = ["include"], deps = [ ":AMDGPUDialect", + ":AMDGPUUtils", ":ArithDialect", ":ArithUtils", ":ConversionPassIncGen", ":IR", + ":LLVMDialect", ":Pass", + ":ROCDLDialect", ":Support", ":TransformUtils", ":VectorDialect",